"""
PyDetex
https://github.com/ppizarror/PyDetex
UTILS TEX
Latex utils.
"""
# Public names of this module: the tagging/searching helpers, the two
# constant tables and the tex-to-unicode converter
__all__ = [
    'apply_tag_between_inside_char_command',
    'apply_tag_tex_commands',
    'apply_tag_tex_commands_no_argv',
    'find_tex_command_char',
    'find_tex_commands',
    'find_tex_commands_noargv',
    'find_tex_environments',
    'get_tex_commands_args',
    'TEX_COMMAND_CHARS',
    'TEX_EQUATION_CHARS',
    'tex_to_unicode'
]
import flatlatex
import os
import re
from flatlatex.parser import LatexSyntaxError
from typing import Tuple, Union, List, Dict, Optional, Any
# Flat latex object: module-wide converter, used by tex_to_unicode as the
# last conversion step
_FLATLATEX = flatlatex.converter(ignore_newlines=False, keep_spaces=True)

# Tex to unicode lookup tables. They start empty and are filled from the
# resource files by __load_unicode(). 'latex_symbols' is a list of
# (code, value) pairs; every other entry is a dict that maps a char to its
# replacement
_TEX_TO_UNICODE: Dict[str, Union[Dict[Any, str], List[Tuple[str, str]]]] = {
    'latex_symbols': [],
    'subscripts': {},
    'superscripts': {},
    'textbb': {},
    'textbf': {},
    'textcal': {},
    'textfrak': {},
    'textit': {},
    'textmono': {}
}

# Valid command chars: a backslash followed by one of these starts a command,
# and a command name is made of these chars
TEX_COMMAND_CHARS = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l',
                     'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x',
                     'y', 'z', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J',
                     'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V',
                     'W', 'X', 'Y', 'Z', '*', '@']

# Equation delimiters, in the (initial, final, ignore escape) format accepted
# by find_tex_command_char. When the last item is True, a delimiter preceded
# by a backslash is not matched
TEX_EQUATION_CHARS = [
    ('$', '$', True),
    (r'\(', r'\)', False),
    (r'\[', r'\]', False),
    ('\\begin{align*}', '\\end{align*}', False),
    ('\\begin{align}', '\\end{align}', False),
    ('\\begin{displaymath}', '\\end{displaymath}', False),
    ('\\begin{equation*}', '\\end{equation*}', False),
    ('\\begin{equation}', '\\end{equation}', False),
    ('\\begin{gather*}', '\\end{gather*}', False),
    ('\\begin{gather}', '\\end{gather}', False),
    ('\\begin{math}', '\\end{math}', False)
]
def find_tex_command_char(
        s: str,
        symbols_char: List[Tuple[str, str, bool]],
) -> Tuple[Tuple[int, int, int, int], ...]:
    """
    Locate every span enclosed by one of the given (initial, final) symbols.

    Example:

    .. code-block:: none

                  00000000001111111111....
                  01234567890123456789....
        Input:    This is a $formula$ and this is not.
        Output:   ((10, 11, 17, 18), ...)

    Each returned tuple holds: the first char of the initial symbol, the first
    char after the initial symbol, the last char before the final symbol and
    the last char of the final symbol.

    :param s: Latex string code
    :param symbols_char: Symbols to check ``[(initial, final, ignore escape), ...]``.
        If *ignore escape* is True, a symbol preceded by a backslash is not matched
    :return: Positions
    """
    assert isinstance(symbols_char, list)
    longest = 0
    for entry in symbols_char:
        assert len(entry) == 3, f'Format is (initial, final, ignore escape); but received {entry}'
        first, last, skip_escaped = entry
        assert isinstance(first, str) and len(first) > 0 and ' ' not in first
        assert isinstance(last, str) and len(last) > 0 and ' ' not in last
        assert isinstance(skip_escaped, bool)
        longest = max(longest, len(first), len(last))

    # One guard char in front (so "previous char" always exists) and enough
    # trailing blanks so that a symbol lookup never runs past the end
    padded = '_' + s + ' ' * longest

    def _matches(start: int, index: int, opening: bool = True) -> bool:
        """
        Tell whether a symbol of the ``index``-th pair starts at ``start``.

        :param start: Position within the padded string
        :param index: Index of the pair within ``symbols_char`` (negative: no pair)
        :param opening: Check the initial (True) or the final symbol
        :return: True if the symbol is present
        """
        if index < 0:
            return False
        initial, final, ignore_escape = symbols_char[index]
        symbol = initial if opening else final
        escaped = ignore_escape and padded[start - 1] == '\\'
        return all(
            padded[start + shift] == char and not (shift == 0 and escaped)
            for shift, char in enumerate(symbol)
        )

    inside = False  # Currently between an initial and a final symbol
    active = -1  # Index of the pair that was opened last
    opened_at = 0
    spans = []
    for pos in range(1, len(padded) - longest):
        opener = next((y for y in range(len(symbols_char)) if _matches(pos, y)), -1)
        closes = _matches(pos, active, False)
        if not inside:
            if opener >= 0:
                opened_at, inside, active = pos, True, opener
        elif closes:
            inside = False
            initial, final, _ = symbols_char[active]
            # Back to positions of the non-padded string
            begin, end = opened_at - 1, pos - 1
            spans.append((begin, begin + len(initial), end - 1, end + len(final) - 1))
    return tuple(spans)
def apply_tag_between_inside_char_command(
        s: str,
        symbols_char: List[Tuple[str, str, bool]],
        tags: Union[Tuple[str, str, str, str], str]
) -> str:
    """
    Insert tags around the symbols found by :func:`find_tex_command_char`.

    For example, if symbols are ``($, $)`` and tag is ``[1,2,3,4]``:

    .. code-block:: none

        Input:    This is a $formula$ and this is not.
        Output:   This is a 1$2formula3$4 and this is not

    :param s: Latex string code
    :param symbols_char: ``[(initial, final, ignore escape), ...]``
    :param tags: Tags to replace; a single string is used for the four positions,
        an empty string leaves the code untouched
    :return: String with tags
    """
    if isinstance(tags, str):
        if tags == '':
            return s
        tags = (tags, tags, tags, tags)
    assert len(tags) == 4
    before_open, after_open, before_close, after_close = tags
    spans = find_tex_command_char(s, symbols_char)
    if len(spans) == 0:
        return s
    pieces = []
    k = 0  # Span currently being processed
    for i, char in enumerate(s):
        if k < len(spans):
            p0, p1, p2, p3 = spans[k]
            if i == p0:
                pieces.append(before_open + char)
                continue
            elif p0 < i < p1:
                pass  # Inside the initial symbol, copied as it is
            elif i == p1 and p1 != p3:
                pieces.append(after_open + char)
                if p2 == p1:  # Content of a single char
                    pieces.append(before_close)
                continue
            elif i == p2 and p2 != p0:
                pieces.append(char + before_close)
                continue
            elif p2 < i < p3:
                pass  # Inside the final symbol, copied as it is
            elif i == p3:
                pieces.append(char + after_close)
                k += 1
                continue
        pieces.append(char)
    return ''.join(pieces)
def find_tex_commands(s: str, offset: int = 0) -> Tuple[Tuple[int, int, int, int, bool], ...]:
    r"""
    Find all tex commands that have arguments within a code.

    One tuple is returned for each argument (``{...}`` or ``[...]``) of each
    command, thus, a command with several arguments yields several tuples that
    share the same ``a`` and ``b``.

    .. code-block:: none

                  00000000001111111111222
                  01234567890123456789012
                          a       b c  d
        Example:  This is \aCommand{nice}...
        Output:   ((8, 16, 18, 21, False), ...)

    ``a`` is the position of the backslash, ``b`` the char right before the
    first argument opener, ``c`` the first char of the argument and ``d`` the
    char right before the argument closer. The last item is True if the next
    tuple belongs to the same command.

    :param s: Latex string code
    :param offset: Offset added to the positioning, useful when using recursive calling on substrings
    :return: Tuple if found codes ``(a, b, c, d, command continues)``
    """
    found: List = []
    is_cmd = False  # A command has been opened
    is_argv = False  # Reading an argument of the command
    s += '_'  # Guard char, so s[i + 1] exists for each i of the loop
    a, b, c0, c1, d = 0, -1, 0, 0, 0  # c0/c1: argument start for {} and []
    depth_0 = 0  # {}
    depth_1 = 0  # []
    cont_chars = ('{', '[', ' ', '\n')  # Chars allowed between the name and its arguments
    cmd_idx = 0  # index
    mode_arg = -1  # Bracket that opened the current argument: 0 is {, 1 is [, -1 none
    for i in range(len(s) - 1):
        # Start a command
        if not is_cmd and s[i] == '\\' and s[i + 1] in TEX_COMMAND_CHARS:
            a, b, is_cmd, is_argv = i, -1, True, False
            cmd_idx += 1
            mode_arg = -1
            depth_0, depth_1 = 0, 0
        # If command before args encounter an invalid char, disables the command
        elif is_cmd and not is_argv and s[i] not in cont_chars and s[i] not in TEX_COMMAND_CHARS:
            is_cmd = False
            # The invalid char may be the backslash of another command
            if s[i] == '\\' and s[i + 1] in TEX_COMMAND_CHARS:
                a, b, is_cmd, is_argv = i, -1, True, False
                cmd_idx += 1
        # If command has a new line, but following chars are not space
        elif is_cmd and not is_argv and s[i] == '\n' and s[i + 1] in TEX_COMMAND_CHARS:
            is_cmd = False
        # If command, not arg, but an invalid char follows the space, disables the command
        elif is_cmd and not is_argv and s[i - 1] == ' ' and s[i] not in cont_chars:
            is_cmd = False
        # Inits a new arg
        elif is_cmd and s[i] in ('{', '[') and s[i - 1] != '\\':
            is_argv = True
            if b == -1:  # First argument of the command, the name ends here
                b = i - 1
                depth_0, depth_1 = 0, 0
            if s[i] == '{':
                if depth_0 == 0:
                    c0 = i + 1
                if mode_arg < 0:
                    mode_arg = 0
                depth_0 += 1
            else:
                if depth_1 == 0:
                    c1 = i + 1
                if mode_arg < 0:
                    mode_arg = 1
                depth_1 += 1
        # Ends the argument, only if depth condition satisfies
        elif is_cmd and is_argv and s[i] in ('}', ']') and s[i - 1] != '\\':
            if s[i] == '}':
                depth_0 -= 1
            else:  # ]
                depth_1 -= 1
            if (depth_0 == 0 and mode_arg == 0) or (depth_1 == 0 and mode_arg == 1):  # Finished
                d = i - 1
                # The command index is stored in the last position, it is
                # replaced by the "continues" flag below
                found.append([a, b, c0 if s[i] == '}' else c1, d, cmd_idx])
                if s[i + 1] not in cont_chars:
                    is_cmd = False
                is_argv = False
                mode_arg = -1
            # elif depth_0 < 0 or depth_1 < 0:  # Invalid argument (parenthesis imbalance)
            #     is_cmd = False
            #     is_argv = False
            #     mode_arg = -1
    # Add the offsets
    for f in found:
        f[0] += offset
        f[1] += offset
        f[2] += offset
        f[3] += offset
    # Check if command continues: an item continues if the next one has the
    # same command index; the last item never continues
    if len(found) == 0:
        return ()
    elif len(found) == 1:
        found[0][4] = False
    else:
        for k in range(1, len(found)):
            if found[k][4] == found[k - 1][4]:
                found[k - 1][4] = True
            else:
                found[k - 1][4] = False
            if k == len(found) - 1:
                found[k][4] = False
    for k in range(len(found)):
        # noinspection PyUnresolvedReferences
        found[k] = tuple(found[k])
    return tuple(found)
def _find_tex_env_recursive(original_s: str, s: str, offset: int = 0, depth: int = 0) -> List:
    """
    Collect the ``begin``/``end`` commands of a code, also looking inside the
    arguments of any other command.

    :param original_s: Whole latex code; only forwarded to the recursive calls
    :param s: Latex string code to search in
    :param offset: Offset applied to the search
    :param depth: Recursion depth; only forwarded (plus one) to the recursive calls
    :return: List of the commands, in the format of :func:`find_tex_commands`
    """
    collected = []
    for tag in find_tex_commands(s, offset=offset):
        cmd_start, cmd_end, arg_start, arg_end, _ = tag
        command = s[cmd_start - offset:cmd_end - offset + 1]
        if 'begin' in command or 'end' in command:
            collected.append(tag)
            continue
        # Any other command: check if there are more environments in its argument
        argument = s[arg_start - offset:arg_end - offset + 1]
        if 'begin' not in argument and 'end' not in argument:
            continue
        if 'newenvironment' in command or 'newcommand' in command:  # Prone to bugs
            continue
        collected.extend(
            _find_tex_env_recursive(original_s, argument, offset=arg_start, depth=depth + 1)
        )
    return collected
def find_tex_environments(s: str) -> Tuple[Tuple[str, int, int, int, int, str, int, int], ...]:
    r"""
    Find all tex environments within a code.

    Example:

    .. code-block:: none

                  0000000000111111111122222222223333333333
                  0123456789012345678901234567890123456789
                          a           b         c       d
        Example:  This is \begin{nice}[cmd]my...\end{nice}
        Output:   (('nice', 8, 20, 30, 38, '', 0, -1), ...)

    ``a`` is the backslash of ``\begin``, ``b`` the position right after the
    closing brace of the ``\begin`` name, ``c`` the backslash of ``\end`` and
    ``d`` the last char of the name within ``\end{...}``.

    This method also returns the name of the parent environment (empty if none),
    the depth of the environment, and the depth of the item environment (if
    itemizable, else -1). An environment is added when its ``\end`` is reached,
    thus, inner environments come before the outer ones.

    :param s: Latex string code
    :return: Tuple if found environment ``(env_name, a, b, c, d, parent_env_name, env_depth, env_item_depth)``
    """

    def _env_common(e: str) -> str:
        """
        Return the common environment for a given name.

        :param e: Environment name
        :return: ``'item_'`` for the itemizable environments, else an empty string
        """
        if ('itemize' in e) or ('enumerate' in e) or ('tablenotes' in e):
            return 'item_'
        return ''

    tags = _find_tex_env_recursive(s, s)
    envs = []
    # Open environments: name => stack of (a, b, parent name, depth)
    env: Dict[str, List[Tuple[int, int, str, int]]] = {}
    last_env = ''  # Name of the innermost open environment
    env_depth = 0
    cmds_cont = []  # Start positions of the begin commands already seen
    env_depths: Dict[str, int] = {}  # Common environment name => current depth
    for t in tags:
        a, b, c, d, _ = t
        if 'begin' in s[a:b + 1]:
            env_name = s[c:d + 1]
            c_env_name = _env_common(env_name)  # Common environment name
            if c_env_name not in env_depths.keys():
                env_depths[c_env_name] = 0
            else:
                env_depths[c_env_name] += 1
            env_i = (a, d + 2, last_env, env_depth)
            if env_name not in env:
                env[env_name] = [env_i]
            else:
                env[env_name].append(env_i)
            # A begin command yields one tag for each of its arguments, all of
            # them with the same start position.
            # NOTE(review): it is assumed that the two updates below are nested
            # within this condition (once per begin command) -- confirm
            if a not in cmds_cont:
                cmds_cont.append(a)
                last_env = env_name
                env_depth += 1
        elif 'end' in s[a:b + 1]:
            env_name = s[c:d + 1]
            c_env_name = _env_common(env_name)  # Common environment name
            if env_name in env.keys():  # An end without its begin is ignored
                env_i = env[env_name].pop()
                # Update env itemize depth
                env_depth_item = -1
                if c_env_name != '':
                    env_depth_item = env_depths[c_env_name]
                    env_depths[c_env_name] -= 1
                envs.append((
                    env_name,  # Environment name
                    env_i[0],  # a-position of the env
                    env_i[1],  # b-position
                    a,  # c-position
                    d,  # d-position
                    env_i[2],  # parent environment name
                    env_i[3],  # depth of the environment
                    env_depth_item  # itemize depth
                ))
                if len(env[env_name]) == 0:
                    del env[env_name]
                last_env = env_i[2]
                env_depth -= 1
    return tuple(envs)
def get_tex_commands_args(
        s: str,
        pos: bool = False
) -> Tuple[Tuple[Union[str, Tuple[str, bool], Tuple[int, int]], ...], ...]:
    r"""
    Return the name and the arguments of each tex command of the code. Every
    argument comes with a boolean which tells whether it is optional
    (written between square brackets).

    .. code-block:: none

        Example:  This is \aCommand[\label{}]{nice} and...
        Output:   (('aCommand', ('\label{}', True), ('nice', False)), ...)

    :param s: Latex string code
    :param pos: Add the numerical position of the original string at the last position
    :return: Arguments
    """
    commands = []
    current = []  # Items of the command being built
    for cmd_start, cmd_end, arg_start, arg_end, continues in find_tex_commands(s):
        if not current:
            # First argument of a command: store its name, without the backslash
            current.append(s[cmd_start + 1:cmd_end + 1].strip())
        enclosed = s[arg_start - 1:arg_end + 2]  # Argument with its brackets
        current.append((enclosed[1:-1], enclosed.startswith('[')))
        if continues:
            continue
        if pos:
            current.append((cmd_start, arg_end + 2))
        commands.append(tuple(current))
        current = []
    return tuple(commands)
def find_tex_commands_noargv(s: str) -> Tuple[Tuple[int, int], ...]:
    r"""
    Find all tex commands with no arguments within a code.

    .. code-block:: none

                  00000000001111111111222
                  01234567890123456789012
                          x       x
        Example:  This is \aCommand ...
        Output:   ((8,16), ...)

    :param s: Latex string code
    :return: Tuple if found codes, each one as (first char, last char)
    """
    spans = []
    in_command = False
    s += '_'  # Guard char, so s[i + 1] always exists within the loop
    start = 0
    continuation = ('{', '[', ' ')
    for i in range(len(s) - 1):
        char = s[i]
        if not in_command:
            if char == '\\' and s[i + 1] in TEX_COMMAND_CHARS:
                if i > 0 and s[i - 1] == '⇲':
                    continue
                start = i
                in_command = True
            continue
        # From here on there is an open command
        if char == '\\':
            # Another backslash closes the current command and opens a new one
            if i - 1 - start > 0:
                spans.append([start, i - 1])
            start = i
        elif char in ('{', '['):
            # The command has arguments, discard it
            in_command = False
        elif s[i - 1] == ' ' and char not in continuation:
            # An invalid char follows the space, the command ended
            in_command = False
            spans.append([start, i - 1])
        elif char not in TEX_COMMAND_CHARS and char not in continuation:
            in_command = False
            spans.append([start, i - 1])
    if in_command and start != len(s) - 2:
        spans.append([start, len(s) - 2])
    # Strip the trailing spaces of each command
    stripped = []
    for first, last in spans:
        for _ in range(last):
            if s[last] != ' ':
                break
            last -= 1
        stripped.append((first, last))
    # noinspection PyTypeChecker
    return tuple(stripped)
def apply_tag_tex_commands(
        s: str,
        tags: Union[Tuple[str, str, str, str, str], str]
) -> str:
    r"""
    Insert tags around the commands found by :func:`find_tex_commands`.

    For example, if tag is ``[1,2,3,4,5]``:

    .. code-block:: none

        Input:    This is a \formula{epic} and this is not
        Output:   This is a 1\formula2{3epic4}5 and this is not

    :param s: Latex string code
    :param tags: Tags (length 5); a single string is used for the five positions,
        an empty string leaves the code untouched
    :return: Code with tags
    """
    if isinstance(tags, str):
        if tags == '':
            return s
        tags = (tags, tags, tags, tags, tags)
    assert len(tags) == 5
    pre_cmd, post_cmd, pre_arg, post_arg, post_close = tags  # Unpack
    tex_tags = find_tex_commands(s)
    if len(tex_tags) == 0:
        return s
    pieces = []
    k = 0  # Moves through tags
    i = 0
    while i < len(s):
        char = s[i]
        if k < len(tex_tags) and i in tex_tags[k][0:4]:
            cmd_start, cmd_end, arg_start, arg_end, continues = tex_tags[k]
            if i == cmd_start:
                pieces.append(pre_cmd + char)
            elif i == cmd_end:
                pieces.append(char + post_cmd)
            elif i == arg_start and i != arg_end:
                pieces.append(pre_arg + char)
            elif i == arg_end:
                if i == arg_start:  # Argument of a single char
                    pieces.append(pre_arg)
                # The closing bracket is written here too, thus it is skipped
                pieces.append(char + post_arg + s[i + 1] + post_close)
                i += 1
                # if continues
                if continues:
                    pieces.append(post_cmd)
                k += 1
        else:
            pieces.append(char)
        i += 1
    return ''.join(pieces)
def apply_tag_tex_commands_no_argv(
        s: str,
        tags: Union[Tuple[str, str], str]
) -> str:
    r"""
    Insert tags around the commands found by :func:`find_tex_commands_noargv`.

    For example, if tag is ``[1,2]``:

    .. code-block:: none

        Input:    This is a \formula and this is not.
        Output:   This is a 1\formula2 and this is not

    :param s: Latex string code
    :param tags: Tags (length 2); a single string is used for both positions,
        an empty string leaves the code untouched
    :return: Code with tags
    """
    if isinstance(tags, str):
        if tags == '':
            return s
        tags = (tags, tags)
    assert len(tags) == 2
    opening, closing = tags  # Unpack
    tex_tags = find_tex_commands_noargv(s)
    if len(tex_tags) == 0:
        return s
    pieces = []
    k = 0  # Moves through tags
    for i, char in enumerate(s):
        if k < len(tex_tags) and i in tex_tags[k]:
            first, last = tex_tags[k]
            if i == first:
                pieces.append(opening + char)
            elif i == last:
                pieces.append(char + closing)
                k += 1
        else:
            pieces.append(char)
    return ''.join(pieces)
def _convert_single_symbol(s: str) -> Optional[str]:
    """
    If ``s`` is just a latex code ``'alpha'`` or ``'beta'`` it converts it to its
    unicode representation.

    A leading backslash is added to ``s`` if missing before it is compared with
    the codes of the ``'latex_symbols'`` table. An empty string is accepted, it
    is looked up as a single backslash.

    :param s: Latex string code
    :return: Unicode value of the symbol, None if ``s`` is not a known symbol
    """
    # startswith() instead of s[0], which fails on an empty string
    if not s.startswith('\\'):
        s = '\\' + s
    for code, val in _TEX_TO_UNICODE['latex_symbols']:
        if code == s:
            return val
    return None
def _convert_latex_symbols(s: str) -> str:
    r"""
    Replace each ``'\alpha'``, ``'\beta'`` and similar latex symbols with
    their unicode representation.

    The replacements are applied one after the other, in the order of the
    ``'latex_symbols'`` table.

    :param s: Latex string code
    :return: Replaced symbols
    """
    converted = s
    for pair in _TEX_TO_UNICODE['latex_symbols']:
        code, val = pair
        converted = converted.replace(code, val)
    return converted
def _process_starting_modifiers(s: str) -> str:
    r"""
    If s start with ``'it '``, ``'cal '``, etc. then the prefix is replaced by
    the opening of its command (``'\it{'``, ``'\cal{'``, etc.), so that the
    rest of the string gets modified later.

    :param s: Latex string code
    :return: Modified text
    """
    # (pattern, replacement), applied in this order
    prefixes = (
        ('^bb ', r'\\bb{'),
        ('^bf ', r'\\bf{'),
        ('^it ', r'\\it{'),
        ('^cal ', r'\\cal{'),
        ('^frak ', r'\\frak{'),
        ('^mono ', r'\\mono{'),
    )
    for pattern, replacement in prefixes:
        s = re.sub(pattern, replacement, s)
    return s
def _apply_all_modifiers(s: str) -> str:
    """
    Run :func:`_apply_modifier` for each known modifier, one after the other.

    :param s: Latex string code
    :return: Text with replaced chars
    """
    # (modifier, name of its table), applied in this order
    stages = (
        ('^', 'superscripts'),
        ('_', 'subscripts'),
        ('\\bb', 'textbb'),
        ('\\bf', 'textbf'),
        ('\\cal', 'textcal'),
        ('\\emph', 'textit'),
        ('\\frak', 'textfrak'),
        ('\\it', 'textit'),
        ('\\mono', 'textmono'),
    )
    for modifier, table in stages:
        s = _apply_modifier(s, modifier, _TEX_TO_UNICODE[table])
    return s
def _apply_modifier(s: str, modifier: str, d: Dict[Any, str]) -> str:
    """
    Search for each occurrence of the modifier and replace the next char (or
    chars, when ``{}`` is used) with its/their representation from ``d``. The
    modifier and the braces are removed; chars not present in ``d`` are kept.

    The modifier is searched as it is, thus, a ``'^'`` contained in the text is
    only consumed when the modifier is ``'^'`` itself.

    :param s: Latex string code
    :param modifier: Modifier command; if empty the text is returned unchanged
    :param d: Dict to look upon
    :return: New text with replaced text.
    """
    if modifier == '':
        return s
    mode_normal, mode_modified, mode_long = range(3)
    mode = mode_normal
    pieces = []
    i = 0
    while i < len(s):
        # The modifier only counts outside a modified section
        if mode == mode_normal and s.startswith(modifier, i):
            mode = mode_modified
            i += len(modifier)
            continue
        ch = s[i]
        i += 1
        if mode == mode_modified:
            if ch == '{':
                mode = mode_long  # Modify all up to the closing brace
            else:
                pieces.append(d.get(ch, ch))  # Modify a single char
                mode = mode_normal
        elif mode == mode_long:
            if ch == '}':
                mode = mode_normal
            else:
                pieces.append(d.get(ch, ch))
        else:
            pieces.append(ch)
    return ''.join(pieces)
def __load_unicode() -> None:
    """
    Loads the unicode data.

    Each table of ``_TEX_TO_UNICODE`` is filled from a utf-8 text file located
    at ``res/`` next to this module: ``u_symbols.txt`` for ``'latex_symbols'``
    and ``u_<key>.txt`` for the rest. Each line provides a code and its value
    separated by whitespace; blank lines are skipped.

    :raises OSError: If a resource file cannot be opened
    :raises IndexError: If a non-blank line has less than two words
    """
    respath = str(os.path.abspath(os.path.dirname(__file__))).replace('\\', '/') + '/res/u_'
    for key, table in _TEX_TO_UNICODE.items():
        is_symbols = key == 'latex_symbols'
        filename = f'{respath}symbols.txt' if is_symbols else f'{respath}{key}.txt'
        with open(filename, encoding='utf-8') as f:
            for line in f:
                words = line.split()
                if not words:  # Blank line, e.g. at the end of the file
                    continue
                code, val = words[0], words[1]
                if is_symbols:
                    table.append((code, val))  # Symbols are kept as an ordered list
                else:
                    table[code] = val
def tex_to_unicode(s: str) -> str:
    """
    Transforms tex code to unicode.

    A blank string is returned as it is, and a string which is a single known
    symbol is directly replaced by its value. Otherwise the symbols and the
    modifiers are converted, the whitespace is normalized, and the result is
    passed through the flatlatex converter. If flatlatex raises a syntax error
    the text obtained before that step is returned.

    :param s: Latex string code
    :return: Text in unicode
    """
    if s.strip() == '':
        return s
    single = _convert_single_symbol(s)
    if single is not None:
        return single
    s = _convert_latex_symbols(s)
    s = _process_starting_modifiers(s)
    s = _apply_all_modifiers(s)
    # Last filter: collapse blank lines and double spaces, tabs become spaces
    s = s.replace('\n\n', '\n').replace('  ', ' ').replace('\t', ' ')
    try:
        s = _FLATLATEX.convert(s)
    except LatexSyntaxError:
        # Best effort: keep the text converted so far
        pass
    return s
# Loads the unicode data
__load_unicode()