Source code for pydetex._utils_tex

"""
PyDetex
https://github.com/ppizarror/PyDetex

UTILS TEX
Latex utils.
"""

# Public names of this module (what ``from <module> import *`` exports)
__all__ = [
    'apply_tag_between_inside_char_command',
    'apply_tag_tex_commands',
    'apply_tag_tex_commands_no_argv',
    'find_tex_command_char',
    'find_tex_commands',
    'find_tex_commands_noargv',
    'find_tex_environments',
    'get_tex_commands_args',
    'TEX_COMMAND_CHARS',
    'TEX_EQUATION_CHARS',
    'tex_to_unicode'
]

import flatlatex
import os
import re

from flatlatex.parser import LatexSyntaxError
from typing import Tuple, Union, List, Dict, Optional, Any

# Flat latex object: single converter instance used by tex_to_unicode,
# created with newlines not ignored and spaces kept
_FLATLATEX = flatlatex.converter(ignore_newlines=False, keep_spaces=True)

# Tex to unicode lookup tables. They are empty here and are filled by
# __load_unicode(): 'latex_symbols' is a list of (code, value) pairs, every
# other key is a {code: value} dictionary
_TEX_TO_UNICODE: Dict[str, Union[Dict[Any, str], List[Tuple[str, str]]]] = {
    'latex_symbols': [],
    'subscripts': {},
    'superscripts': {},
    'textbb': {},
    'textbf': {},
    'textcal': {},
    'textfrak': {},
    'textit': {},
    'textmono': {}
}

# Valid command chars: the ASCII letters plus '*' and '@'
TEX_COMMAND_CHARS = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l',
                     'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x',
                     'y', 'z', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J',
                     'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V',
                     'W', 'X', 'Y', 'Z', '*', '@']

# Equation delimiters as (initial, final, ignore escape), the format expected
# by find_tex_command_char. Every backslash is written as '\\' so that no
# literal relies on an invalid escape sequence such as '\(' or '\e'
TEX_EQUATION_CHARS = [
    ('$', '$', True),
    ('\\(', '\\)', False),
    ('\\[', '\\]', False),
    ('\\begin{align*}', '\\end{align*}', False),
    ('\\begin{align}', '\\end{align}', False),
    ('\\begin{displaymath}', '\\end{displaymath}', False),
    ('\\begin{equation*}', '\\end{equation*}', False),
    ('\\begin{equation}', '\\end{equation}', False),
    ('\\begin{gather*}', '\\end{gather*}', False),
    ('\\begin{gather}', '\\end{gather}', False),
    ('\\begin{math}', '\\end{math}', False)
]


def find_tex_command_char(
    s: str,
    symbols_char: List[Tuple[str, str, bool]],
) -> Tuple[Tuple[int, int, int, int], ...]:
    """
    Locate every region of the string enclosed by a pair of delimiters.

    Four positions are reported for each region: the first char of the opening
    delimiter, the first char of the content, the last char of the content and
    the last char of the closing delimiter.

    Example:

    .. code-block:: none

               00000000001111111111....
               01234567890123456789....
        Input: This is a $formula$ and this is not.
        Output: ((10, 11, 17, 18), ...)

    :param s: Latex string code
    :param symbols_char: Delimiters to look for ``[(initial, final, ignore escape), ...]``.
        If the last item is True, a delimiter preceded by a backslash is not matched
    :return: Positions of each region found
    """
    assert isinstance(symbols_char, list)
    longest = 0
    for symbol in symbols_char:
        assert len(symbol) == 3, f'Format is (initial, final, ignore escape); but received {symbol}'
        opening, closing, skip_escaped = symbol
        assert isinstance(opening, str) and len(opening) > 0 and ' ' not in opening
        assert isinstance(closing, str) and len(closing) > 0 and ' ' not in closing
        assert isinstance(skip_escaped, bool)
        longest = max(longest, len(opening), len(closing))

    # One leading char lets the escape check look behind the first position;
    # the trailing blanks (never part of a delimiter) pad the look-ahead
    padded = '_' + s + ' ' * longest

    def _matches(pos: int, index: int, opening: bool = True) -> bool:
        """
        Check whether a delimiter of the given symbol starts at a position.

        :param pos: Position within the padded string
        :param index: Index of the symbol within the list (negative means none)
        :param opening: Test the initial (True) or the final delimiter
        :return: True if the delimiter is present
        """
        if index < 0:
            return False
        initial, final, skip_escaped = symbols_char[index]
        token = initial if opening else final
        if not padded.startswith(token, pos):
            return False
        return not (skip_escaped and padded[pos - 1] == '\\')

    inside = False  # Currently between an opening and its closing delimiter
    open_index = -1  # Index of the symbol that opened the current region
    open_pos = 0  # Position (padded string) of the opening delimiter
    regions = []

    for pos in range(1, len(padded) - longest):
        if not inside:
            # First symbol of the list whose opening delimiter starts here
            candidate = next(
                (idx for idx in range(len(symbols_char)) if _matches(pos, idx)),
                -1
            )
            if candidate >= 0:
                inside, open_index, open_pos = True, candidate, pos
        elif _matches(pos, open_index, False):
            inside = False
            # Remove the leading pad char to get positions in the input string
            first, last = open_pos - 1, pos - 1
            initial, final, _ = symbols_char[open_index]
            regions.append((first, first + len(initial), last - 1, last + len(final) - 1))

    return tuple(regions)


def apply_tag_between_inside_char_command(
    s: str,
    symbols_char: List[Tuple[str, str, bool]],
    tags: Union[Tuple[str, str, str, str], str]
) -> str:
    """
    Insert tags around the delimiters and the content of each delimited region.

    For example, if symbols are ``($, $)`` and tag is ``[1,2,3,4]``:

    .. code-block:: none

        Input: This is a $formula$ and this is not.
        Output: This is a 1$2formula3$4 and this is not

    :param s: Latex string code
    :param symbols_char:  ``[(initial, final, ignore escape), ...]``
    :param tags: Four tags (before the opening delimiter, before the content, after
        the content, after the closing delimiter); a single string is used for all
        four, and an empty string returns the code untouched
    :return: String with tags
    """
    if isinstance(tags, str):
        if tags == '':
            return s
        tags = (tags,) * 4

    assert len(tags) == 4
    a, b, c, d = tags
    regions = find_tex_command_char(s, symbols_char)
    if len(regions) == 0:
        return s

    pieces = []
    current = 0  # Index of the region being processed
    for pos, ch in enumerate(s):
        piece = ch
        if current < len(regions):
            open_first, inner_first, inner_last, close_last = regions[current]
            if pos == open_first:
                piece = a + ch
            elif open_first < pos < inner_first:
                pass  # Remaining chars of the opening delimiter
            elif pos == inner_first and inner_first != close_last:
                piece = b + ch
                if inner_last == inner_first:  # Content is a single char
                    piece += c
            elif pos == inner_last and inner_last != open_first:
                piece = ch + c
            elif inner_last < pos < close_last:
                pass  # Leading chars of the closing delimiter
            elif pos == close_last:
                piece = ch + d
                current += 1
        pieces.append(piece)

    return ''.join(pieces)


def find_tex_commands(s: str, offset: int = 0) -> Tuple[Tuple[int, int, int, int, bool], ...]:
    """
    Find all tex commands that have arguments within a code.

    One entry is returned for each argument (``{...}`` or ``[...]``) of each
    command. The positions are: ``a`` the backslash of the command, ``b`` the
    char right before the first argument opener, ``c`` the first char inside
    the argument and ``d`` the last char inside the argument. The last item
    is True if the next entry is another argument of the same command.

    .. code-block:: none

                 00000000001111111111222
                 01234567890123456789012
                         a        b c  d
        Example: This is \\aCommand{nice}...
        Output: ((8, 16, 18, 21, False), ...)

    :param s: Latex string code
    :param offset: Offset added to the positioning, useful when using recursive calling on substrings
    :return: Tuple if found codes ``(a, b, c, d, command continues)``
    """
    found: List = []
    is_cmd = False
    is_argv = False
    # Sentinel: makes s[i + 1] valid on the last char (and it is what s[i - 1]
    # reads when i is 0)
    s += '_'
    a, b, c0, c1, d = 0, -1, 0, 0, 0
    depth_0 = 0  # {}
    depth_1 = 0  # []
    cont_chars = ('{', '[', ' ', '\n')
    command_chars = frozenset(TEX_COMMAND_CHARS)  # Tested several times per char
    cmd_idx = 0  # Number of the current command, shared by all of its arguments
    mode_arg = -1  # Bracket kind that opened the current argument: 0 is {}, 1 is []

    for i in range(len(s) - 1):
        # Start a command
        if not is_cmd and s[i] == '\\' and s[i + 1] in command_chars:
            a, b, is_cmd, is_argv = i, -1, True, False
            cmd_idx += 1
            mode_arg = -1
            depth_0, depth_1 = 0, 0

        # If command before args encounter an invalid char, disables the command
        elif is_cmd and not is_argv and s[i] not in cont_chars and s[i] not in command_chars:
            is_cmd = False
            if s[i] == '\\' and s[i + 1] in command_chars:
                a, b, is_cmd, is_argv = i, -1, True, False
                cmd_idx += 1

        # If command has a new line, but following chars are not space
        elif is_cmd and not is_argv and s[i] == '\n' and s[i + 1] in command_chars:
            is_cmd = False

        # If command, not arg, but an invalid char follows the space, disables the command
        elif is_cmd and not is_argv and s[i - 1] == ' ' and s[i] not in cont_chars:
            is_cmd = False

        # Inits a new arg
        elif is_cmd and s[i] in ('{', '[') and s[i - 1] != '\\':
            is_argv = True
            if b == -1:
                b = i - 1
                depth_0, depth_1 = 0, 0
            if s[i] == '{':
                if depth_0 == 0:
                    c0 = i + 1
                    if mode_arg < 0:
                        mode_arg = 0
                depth_0 += 1
            else:
                if depth_1 == 0:
                    c1 = i + 1
                    if mode_arg < 0:
                        mode_arg = 1
                depth_1 += 1

        # Ends the argument, only if depth condition satisfies. An unbalanced
        # closer is not treated as an error: it only changes the depth counter
        elif is_cmd and is_argv and s[i] in ('}', ']') and s[i - 1] != '\\':
            if s[i] == '}':
                depth_0 -= 1
            else:  # ]
                depth_1 -= 1

            if (depth_0 == 0 and mode_arg == 0) or (depth_1 == 0 and mode_arg == 1):  # Finished
                d = i - 1
                found.append((a, b, c0 if s[i] == '}' else c1, d, cmd_idx))
                if s[i + 1] not in cont_chars:
                    is_cmd = False
                is_argv = False
                mode_arg = -1

    # Apply the offset and turn the command number into the "continues" flag:
    # an entry continues if the next entry belongs to the same command
    last = len(found) - 1
    return tuple(
        (p0 + offset, p1 + offset, p2 + offset, p3 + offset,
         k < last and found[k + 1][4] == idx)
        for k, (p0, p1, p2, p3, idx) in enumerate(found)
    )


def _find_tex_env_recursive(original_s: str, s: str, offset: int = 0, depth: int = 0) -> List:
    """
    Find all the ``\\begin`` and ``\\end`` commands, including those located
    within the arguments of other commands.

    A command is an environment delimiter only if its name is exactly
    ``begin`` or ``end``; commands whose name merely contains those words
    (such as ``\\legend`` or ``\\appendix``) are handled as any other command.

    :param original_s: Complete latex code; it is only forwarded to the recursive calls
    :param s: Latex string code to analyze (an argument of a command on recursive calls)
    :param offset: Offset applied to the search (position of ``s`` within the complete code)
    :param depth: Recursion depth; it is only forwarded, incremented, to the recursive calls
    :return: List of the command tags ``(a, b, c, d, command continues)``
    """
    env_tags = []
    for tag in find_tex_commands(s, offset=offset):
        a, b, c, d, _ = tag
        # Name of the command, without the backslash nor the chars (spaces,
        # new lines) that may precede the first argument
        cmd_name = s[a - offset + 1:b - offset + 1].strip()
        if cmd_name in ('begin', 'end'):
            env_tags.append(tag)
            continue

        # Any other command: look for environments within its argument
        cmd_args = s[c - offset:d - offset + 1]
        if 'begin' not in cmd_args and 'end' not in cmd_args:
            continue
        if 'newenvironment' in cmd_name or 'newcommand' in cmd_name:  # Prone to bugs
            continue
        env_tags.extend(_find_tex_env_recursive(original_s, cmd_args, offset=c, depth=depth + 1))
    return env_tags


def find_tex_environments(s: str) -> Tuple[Tuple[str, int, int, int, int, str, int, int], ...]:
    r"""
    Find all tex environments within a code.

    The positions are: ``a`` the backslash of ``\begin``, ``b`` the char after
    the brace that closes the environment name, ``c`` the backslash of
    ``\end`` and ``d`` the last char of the environment name within ``\end``.

    Example:

    .. code-block:: none

                 0000000000111111111122222222223333333333
                 0123456789012345678901234567890123456789
                         a           b         c       d
        Example: This is \begin{nice}[cmd]my...\end{nice}
        Output: (('nice', 8, 20, 30, 38, '', 0, -1), ...)

    This method also returns the name of the parent environment, the depth of the
    environment, and the depth of the item environment (if itemizable). The
    environments are returned in the order in which they are closed.

    :param s: Latex string code
    :return: Tuple if found environment ``(env_name, a, b, c, d, parent_env_name, env_depth, env_item_depth)``
    """

    def _env_common(e: str) -> str:
        """
        Return the common environment for a given name.

        :param e: Environment name
        :return: ``'item_'`` for the itemizable environments, empty otherwise
        """
        if ('itemize' in e) or ('enumerate' in e) or ('tablenotes' in e):
            return 'item_'
        return ''

    envs = []
    # Environments opened and not closed yet: {name: [(a, b, parent, depth), ...]}
    open_envs: Dict[str, List[Tuple[int, int, str, int]]] = {}
    last_env = ''
    env_depth = 0
    begin_positions = set()  # Positions of the \begin commands already counted
    env_depths: Dict[str, int] = {}

    for a, b, c, d, _ in _find_tex_env_recursive(s, s):
        # Only exact \begin and \end commands delimit an environment; a name
        # that merely contains those words (\legend, \appendix) is skipped
        cmd_name = s[a + 1:b + 1].strip()
        if cmd_name not in ('begin', 'end'):
            continue
        env_name = s[c:d + 1]
        c_env_name = _env_common(env_name)  # Common environment name

        if cmd_name == 'begin':
            env_depths[c_env_name] = env_depths.get(c_env_name, -1) + 1
            open_envs.setdefault(env_name, []).append((a, d + 2, last_env, env_depth))
            # NOTE(review): every argument of a \begin gives its own tag, so an
            # optional argument such as [cmd] is also stored as an opened
            # environment named after its content; only the first tag of each
            # \begin updates the parent and the depth
            if a not in begin_positions:
                begin_positions.add(a)
                last_env = env_name
                env_depth += 1
            continue

        # \end of an environment that was never opened: ignored
        if env_name not in open_envs:
            continue
        env_i = open_envs[env_name].pop()

        # Update env itemize depth
        env_depth_item = -1
        if c_env_name != '':
            env_depth_item = env_depths[c_env_name]
            env_depths[c_env_name] -= 1

        envs.append((
            env_name,  # Environment name
            env_i[0],  # a-position of the env
            env_i[1],  # b-position
            a,  # c-position
            d,  # d-position
            env_i[2],  # parent environment name
            env_i[3],  # depth of the environment
            env_depth_item  # itemize depth
        ))

        if not open_envs[env_name]:
            del open_envs[env_name]
        last_env = env_i[2]
        env_depth -= 1

    return tuple(envs)


def get_tex_commands_args(
    s: str,
    pos: bool = False
) -> Tuple[Tuple[Union[str, Tuple[str, bool], Tuple[int, int]], ...], ...]:
    """
    Get all the arguments from a tex command. Each command argument has a boolean
    indicating if that is optional or not.

    .. code-block:: none

        Example: This is \\aCommand[\\label{}]{nice} and...
        Output: (('aCommand', ('\\label{}', True), ('nice', False)), ...)

    :param s: Latex string code
    :param pos: Add the numerical position of the original string at the last position
    :return: For each command, its name followed by its ``(argument, is optional)``
        pairs and, if ``pos`` is True, by the position of the command
    """
    commands = []
    command: List = []
    for a, b, c, d, cont in find_tex_commands(s):
        if not command:
            # First argument of the command: store the name (without backslash)
            command.append(s[a + 1:b + 1].strip())
        # Argument including its brackets; the opening one tells if it is optional
        arg = s[c - 1:d + 2]
        command.append((arg[1:-1], arg.startswith('[')))
        if not cont:
            # Last argument of the command
            if pos:
                command.append((a, d + 2))
            commands.append(tuple(command))
            command = []
    return tuple(commands)


def find_tex_commands_noargv(s: str) -> Tuple[Tuple[int, int], ...]:
    r"""
    Find all tex commands with no arguments within a code.

    Each position pair is the backslash of the command and its last char
    (the spaces that follow the command are not included).

    .. code-block:: none

                 00000000001111111111222
                 01234567890123456789012
                         x       x
        Example: This is \aCommand ...
        Output: ((8,16), ...)

    :param s: Latex string code
    :return: Tuple if found codes
    """
    found = []
    is_cmd = False
    s += '_'  # Sentinel, makes s[i + 1] valid on the last char
    a = 0
    cont_chars = ('{', '[', ' ')
    command_chars = frozenset(TEX_COMMAND_CHARS)  # Tested several times per char

    for i in range(len(s) - 1):
        if not is_cmd and s[i] == '\\' and s[i + 1] in command_chars:
            # A backslash right after this marker char does not start a command
            if i > 0 and s[i - 1] == '⇲':
                continue
            a = i
            is_cmd = True

        # Another backslash: stores the current command (if it has at least
        # one char after its backslash) and restarts from here
        elif is_cmd and s[i] == '\\':
            if i - 1 - a > 0:
                found.append((a, i - 1))
            a = i

        # The command has arguments, thus it is discarded
        elif is_cmd and s[i] in ('{', '['):
            is_cmd = False

        # If command, not arg, but an invalid char follows the space, disables the command
        elif is_cmd and s[i - 1] == ' ' and s[i] not in cont_chars:
            is_cmd = False
            found.append((a, i - 1))

        elif is_cmd and s[i] not in command_chars and s[i] not in cont_chars:
            is_cmd = False
            found.append((a, i - 1))

    # Command that reaches the end of the code
    if is_cmd and a != len(s) - 2:
        found.append((a, len(s) - 2))

    # Strip the trailing spaces of each command
    stripped = []
    for start, end in found:
        while end > 0 and s[end] == ' ':
            end -= 1
        stripped.append((start, end))

    return tuple(stripped)


def apply_tag_tex_commands(
    s: str,
    tags: Union[Tuple[str, str, str, str, str], str]
) -> str:
    """
    Insert tags around the name and the arguments of each tex command.

    For example, if tag is ``[1,2,3,4,5]``:

    .. code-block:: none

        Input: This is a \\formula{epic} and this is not
        Output: This is a 1\\formula2{3epic4}5 and this is not

    :param s: Latex string code
    :param tags: Tags (length 5); a single string is used for the five tags, and
        an empty string returns the code untouched
    :return: Code with tags
    """
    if isinstance(tags, str):
        if tags == '':
            return s
        tags = (tags,) * 5
    assert len(tags) == 5
    a, b, c, d, e = tags  # Unpack

    tex_tags = find_tex_commands(s)
    if len(tex_tags) == 0:
        return s

    pieces = []
    current = 0  # Index of the tag being processed
    pos = 0
    total = len(s)
    while pos < total:
        ch = s[pos]
        if current < len(tex_tags) and pos in tex_tags[current][0:4]:
            cmd_first, cmd_last, arg_first, arg_last, continues = tex_tags[current]
            if pos == cmd_first:
                pieces.append(a + ch)
            elif pos == cmd_last:
                pieces.append(ch + b)
            elif pos == arg_first and pos != arg_last:
                pieces.append(c + ch)
            elif pos == arg_last:
                if pos == arg_first:  # The argument is a single char
                    pieces.append(c)
                # The closing bracket is written here too, so it is skipped
                pieces.append(ch + d + s[pos + 1] + e)
                pos += 1
                # if continues
                if continues:
                    pieces.append(b)
                current += 1
        else:
            pieces.append(ch)
        pos += 1

    return ''.join(pieces)


def apply_tag_tex_commands_no_argv(
    s: str,
    tags: Union[Tuple[str, str], str]
) -> str:
    """
    Apply tag to tex commands that have no arguments.

    For example, if tag is ``[1,2]``:

    .. code-block:: none

        Input: This is a \\formula and this is not.
        Output: This is a 1\\formula2 and this is not

    :param s: Latex string code
    :param tags: Tags (length 2); a single string is used for both tags, and an
        empty string returns the code untouched
    :return: Code with tags
    """
    if isinstance(tags, str):
        if tags == '':
            return s
        tags = (tags, tags)
    assert len(tags) == 2
    a, b = tags  # Unpack

    tex_tags = find_tex_commands_noargv(s)
    if len(tex_tags) == 0:
        return s

    pieces = []
    current = 0  # Index of the tag being processed
    for pos, ch in enumerate(s):
        if current < len(tex_tags) and pos in tex_tags[current]:
            first, last = tex_tags[current]
            piece = ch
            if pos == first:
                piece = a + piece
            # Not an elif: a tag that starts and ends at the same position must
            # be closed as well, otherwise the following tags are never reached
            if pos == last:
                piece += b
                current += 1
            pieces.append(piece)
        else:
            pieces.append(ch)

    return ''.join(pieces)


def _convert_single_symbol(s: str) -> Optional[str]:
    """
    If ``s`` is just a latex code ``'alpha'`` or ``'beta'`` it converts it to its
    unicode representation. The leading backslash of the code is optional.

    :param s: Latex string code
    :return: Unicode representation, or None if ``s`` is not a known latex symbol
    """
    # startswith also handles the empty string, which s[0] would not
    if not s.startswith('\\'):
        s = '\\' + s
    for code, val in _TEX_TO_UNICODE['latex_symbols']:
        if code == s:
            return val
    return None


def _convert_latex_symbols(s: str) -> str:
    """
    Replace each ``'\\alpha'``, ``'\\beta'`` and similar latex symbols with
    their unicode representation.

    :param s: Latex string code
    :return: Replaced symbols
    """
    # NOTE(review): this is a plain substring replacement applied in the order
    # of the loaded table, so a code that is the prefix of a longer one would be
    # replaced inside it if it comes first - confirm the ordering of the table
    for code, val in _TEX_TO_UNICODE['latex_symbols']:
        s = s.replace(code, val)
    return s


def _process_starting_modifiers(s: str) -> str:
    """
    If s start with ``'it '``, ``'cal '``, etc. then make the whole string
    italic, calligraphic, etc.

    :param s: Latex string code
    :return: Modified text
    """
    s = re.sub('^bb ', r'\\bb{', s)
    s = re.sub('^bf ', r'\\bf{', s)
    s = re.sub('^it ', r'\\it{', s)
    s = re.sub('^cal ', r'\\cal{', s)
    s = re.sub('^frak ', r'\\frak{', s)
    s = re.sub('^mono ', r'\\mono{', s)
    return s


def _apply_all_modifiers(s: str) -> str:
    """
    Apply, one after the other, every modifier to the text.

    :param s: Latex string code
    :return: Text with replaced chars
    """
    # (modifier command, table of _TEX_TO_UNICODE); the order is relevant as
    # each step works on the result of the previous one
    steps = (
        ('^', 'superscripts'),
        ('_', 'subscripts'),
        ('\\bb', 'textbb'),
        ('\\bf', 'textbf'),
        ('\\cal', 'textcal'),
        ('\\emph', 'textit'),
        ('\\frak', 'textfrak'),
        ('\\it', 'textit'),
        ('\\mono', 'textmono'),
    )
    for modifier, table in steps:
        s = _apply_modifier(s, modifier, _TEX_TO_UNICODE[table])
    return s


def _apply_modifier(s: str, modifier: str, d: Dict[Any, str]) -> str:
    """
    This will search for the ^ signs and replace the next
    digit or (digits when {} is used) with its/their uppercase representation.

    :param s: Latex string code
    :param modifier: Modifier command
    :param d: Dict to look upon
    :return: New text with replaced text.
    """
    s = s.replace(modifier, "^")
    newtext = ""
    mode_normal, mode_modified, mode_long = range(3)
    mode = mode_normal
    for ch in s:
        if mode == mode_normal and ch == '^':
            mode = mode_modified
            continue
        elif mode == mode_modified and ch == '{':
            mode = mode_long
            continue
        elif mode == mode_modified:
            newtext += d.get(ch, ch)
            mode = mode_normal
            continue
        elif mode == mode_long and ch == '}':
            mode = mode_normal
            continue

        if mode == mode_normal:
            newtext += ch
        else:
            newtext += d.get(ch, ch)
    return newtext


def __load_unicode() -> None:
    """
    Fill the tables of ``_TEX_TO_UNICODE`` from the ``res/u_*.txt`` files
    located next to this module.

    The first two whitespace-separated words of each line are read as
    ``code value``. The ``latex_symbols`` table (a list of pairs) is read from
    ``u_symbols.txt``; every other table (a dict) from ``u_<table name>.txt``.
    """
    respath = str(os.path.abspath(os.path.dirname(__file__))).replace('\\', '/') + '/res/u_'
    for name, table in _TEX_TO_UNICODE.items():
        is_symbols = name == 'latex_symbols'
        path = f'{respath}symbols.txt' if is_symbols else f'{respath}{name}.txt'
        with open(path, encoding='utf-8') as f:
            for line in f:
                words = line.split()
                code, val = words[0], words[1]
                if is_symbols:
                    table.append((code, val))
                else:
                    table[code] = val


def tex_to_unicode(s: str) -> str:
    """
    Convert tex code to its unicode representation.

    A blank string is returned as is, and a string that is just one latex
    symbol is replaced by that symbol. Otherwise the symbols and the modifiers
    are replaced, the spacing is reduced and the result is passed to the
    flatlatex converter; if that raises a syntax error the text before that
    last step is returned.

    :param s: Latex string code
    :return: Text in unicode
    """
    if s.strip() == '':
        return s
    single = _convert_single_symbol(s)
    if single is not None:
        return single

    text = _apply_all_modifiers(_process_starting_modifiers(_convert_latex_symbols(s)))

    # Last filter
    text = text.replace('\n\n', '\n').replace('  ', ' ').replace('\t', ' ')
    try:
        return _FLATLATEX.convert(text)
    except LatexSyntaxError:
        return text


# Loads the unicode data when the module is imported, filling the
# _TEX_TO_UNICODE tables from the resource files
__load_unicode()