Source code for pydetex.parsers

"""
PyDetex
https://github.com/ppizarror/PyDetex

PARSERS
Defines the parsers; each one performs a single task that removes or transforms LaTeX markup.
"""

# Names exported by ``from pydetex.parsers import *``. Helpers prefixed with an
# underscore are intentionally left out of this list.
__all__ = [
    'find_str',
    'FONT_FORMAT_SETTINGS',
    'process_begin_document',
    'process_chars_equations',
    'process_cite',
    'process_citeauthor',
    'process_def',
    'process_inputs',
    'process_items',
    'process_labels',
    'process_ref',
    'remove_commands_char',
    'remove_commands_param',
    'remove_commands_param_noargv',
    'remove_comments',
    'remove_common_tags',
    'remove_environments',
    'remove_equations',
    'remove_tag',
    'replace_pydetex_tags',
    'simple_replace',
    'strip_punctuation',
    'unicode_chars_equations'
]

import os
import pydetex.utils as ut

from pydetex._symbols import *
from typing import List, Tuple, Union, Optional, Callable

# Files
# State shared between calls to process_inputs():
# - _LAST_NOT_FOUND_FILES_PATH: one-element list holding the working directory
#   for which _NOT_FOUND_FILES is valid; process_inputs() resets the cache when
#   os.getcwd() no longer matches it.
# - _NOT_FOUND_FILES: file names that could not be loaded, so they are not
#   searched again.
# - _PRINT_LOCATION: set to True once process_inputs() has handled the
#   "current path location" message for the current directory.
_LAST_NOT_FOUND_FILES_PATH = [os.getcwd()]
_NOT_FOUND_FILES = []
_PRINT_LOCATION = False

# Tags
# Placeholder tokens written into the text while parsing. Most of them are
# turned back into their final characters by replace_pydetex_tags();
# _TAG_FILE_ERROR is the sentinel returned by _load_file() when a file is missing.
_TAG_BRACE_CLOSE = '⇱BRACE_CLOSE⇲'
_TAG_BRACE_OPEN = '⇱BRACE_OPEN⇲'
_TAG_CLOSE_CITE = '⇱CLOSE_CITE⇲'
_TAG_CLOSE_CITE_EQN = '⇱CLOSE_CITE_EQN⇲'
_TAG_DOLLAR_SYMBOL = '⇱DOLLAR_SYMBOL⇲'
_TAG_FILE_ERROR = '⇱FILE_ERROR⇲'
_TAG_ITEM_SPACE = '⇱ITEM_SPACE⇲'
_TAG_NEW_LINE = '⇱NEW_LINE⇲'
_TAG_OPEN_CITE = '⇱OPEN_CITE⇲'
_TAG_OPEN_CITE_EQN = '⇱OPEN_CITE_EQN⇲'
_TAG_PERCENTAGE_SYMBOL = '⇱COMMENT_PERCENTAGE_SYMBOL⇲'

# Others
# (value, numeral) pairs in descending order of value.
# NOTE(review): not referenced in the part of the file reviewed here; presumably
# used for roman-numeral conversion further down - confirm.
_ROMAN_DIGITS = [
    (1000, 'M'),
    (900, 'CM'),
    (500, 'D'),
    (400, 'CD'),
    (100, 'C'),
    (90, 'XC'),
    (50, 'L'),
    (40, 'XL'),
    (10, 'X'),
    (9, 'IX'),
    (5, 'V'),
    (4, 'IV'),
    (1, 'I')
]

# Stores the learned definitions
_DEFS = {}

# Parser font format. This dict stores the font of some tex elements to be represented
# in the GUI text editor. The values are the same of _fonts.FONT_TAGS. By default,
# they are empty, and are updated in the PyDetexGUI._process() method
FONT_FORMAT_SETTINGS = {
    'bold': '',
    'cite': '',
    'equation': '',
    'hl': '',
    'italic': '',
    'normal': '',
    'ref': '',
    'strike': '',
    'tex_text_tag': '',
    'tex_text_tag_content': '',
    'underline': ''
}

# Language-dependent text snippets; the parsers query it with
# LANG_TT_TAGS.get(lang, key).
LANG_TT_TAGS = ut.LangTexTextTags()


def _find_str(s: str, char: str) -> int:
    """
    Finds a sequence within a string, and returns the position. If not exists, returns ``-1``.

    An empty sequence is treated as "not found" (``-1``) instead of raising
    ``IndexError``, because there is nothing meaningful to locate.

    :param s: Latex string code
    :param char: Sequence
    :return: Position of the first occurrence, or ``-1`` if the sequence is absent or empty
    """
    if not char:
        return -1
    # str.find performs the same left-to-right first-occurrence search as a
    # manual character loop, but natively.
    return s.find(char)


def _os_listfolder() -> List[str]:
    """
    Lists the sub-folders of the current working directory.

    Each name is returned with a trailing ``'/'`` so that it can be used
    directly as a path prefix.

    :return: Folder list
    """
    return [entry + '/' for entry in os.listdir('./') if os.path.isdir(entry)]


def _load_file(f: str, path: str) -> str:
    """
    Attempts to read a file located at ``path + f``.

    :param f: Filename
    :param path: Path prefix to look from (concatenated as-is with the filename)
    :return: File contents, or the file-error tag if the file does not exist
    """
    target = path + f
    try:
        contents = ut.open_file(target)
    except FileNotFoundError:
        contents = _TAG_FILE_ERROR
    return contents


def find_str(s: str, char: Union[str, List[str], Tuple[str, ...]]) -> int:
    """
    Locates a sequence (or the first matching one of several) within a string.

    When several sequences are given they are tried in the order provided, and
    the position of the first sequence that occurs in ``s`` is returned; this
    is not necessarily the smallest position among all the sequences.

    :param s: Latex string code
    :param char: Sequence or List of sequence
    :return: Position, or ``-1`` if nothing was found
    """
    if isinstance(char, str):
        return _find_str(s, char)
    # Lazily evaluate each candidate and stop at the first hit
    positions = (_find_str(s, candidate) for candidate in char)
    return next((pos for pos in positions if pos != -1), -1)


def remove_tag(s: str, tagname: str) -> str:
    """
    Removes a latex tag code, keeping its contents. For example,
    ``'\\textbf{hello}'`` becomes ``'hello'`` for the tag ``'textbf'``.

    Nested braces inside the tag contents are respected. A tag whose opening
    brace is never closed is left untouched (previously this raised
    ``IndexError``, or looped forever when the tag started the string).

    :param s: Latex string code
    :param tagname: Tag code
    :return: String without tags
    """
    tagname = '\\' + tagname
    # NOTE(review): when the caller already supplies the '{', one extra
    # character of the content is skipped (tagadd = 1). Kept as-is; confirm
    # that this is intended.
    tagadd = 1
    if '{' not in tagname:
        tagname += '{'
        tagadd = 0

    start = 0  # Only advanced past tags that cannot be closed
    while True:
        k = s.find(tagname, start)
        if k == -1:  # No more tags, return
            return s

        # Look for the brace that closes the tag, honouring nested braces
        deep = 0
        close = -1
        for j in range(k, len(s)):
            if s[j] == '{':
                deep += 1
            elif s[j] == '}':
                deep -= 1
                if deep == 0:
                    close = j
                    break

        if close == -1:
            # Unbalanced tag: skip it so the search can make progress
            start = k + len(tagname)
            continue

        # Keep the content, drop the tag name and both braces
        s = s[:k] + s[k + len(tagname) + tagadd:close] + s[close + 1:]


def remove_common_tags(
    s: str,
    replace_tags: Optional[List] = None,
    **kwargs
) -> str:
    """
    Strips a set of common formatting/sectioning tags, keeping their contents.

    :param s: Latex string code
    :param replace_tags: List to replace. If ``None``, default will be used
    :param kwargs: Optional keyword arguments; ``pb`` is a progress bar to update
    :return: Text without tags
    """
    tags = replace_tags
    if tags is None:
        tags = [
            'chapter',
            'emph',
            'emph',
            'hl',
            'section',
            'subsection',
            'subsubsection',
            'subsubsubsection',
            'textbf',
            'textit',
            'textsuperscript',
            'texttt'
        ]

    # Tags are removed one after the other, in list order
    cleaned = s
    for name in tags:
        cleaned = remove_tag(cleaned, name)

    progress = kwargs.get('pb')
    if progress:  # Update progressbar
        progress.update('Removing common tags')
    return cleaned


def _compress_cite_numbers(cite_nums: List[int]) -> List[str]:
    """
    Collapses runs of three or more consecutive numbers into ``'first-last'``.

    Shorter runs are written number by number, e.g. ``[1, 2, 3, 10, 11]``
    gives ``['1-3', '10', '11']``.

    :param cite_nums: Cite numbers, in the order they must be written
    :return: List of single numbers and ranges as strings
    """
    compressed: List[str] = []
    total = len(cite_nums)
    i = 0
    while i < total:
        # Extend the run while the next number is exactly the successor
        j = i
        while j + 1 < total and cite_nums[j + 1] - cite_nums[j] == 1:
            j += 1
        if j - i >= 2:
            compressed.append(f'{cite_nums[i]}-{cite_nums[j]}')
        else:
            compressed.extend(str(w) for w in cite_nums[i:j + 1])
        i = j + 1
    return compressed


def process_cite(
    s: str,
    sort_cites: bool = True,
    compress_cite: bool = True,
    cite_separator: str = ', ',
    **kwargs
) -> str:
    r"""
    Transforms all cites to a text-based with numbers. For example,
    ``'This is from \cite{Pizarro}'`` to ``'This is from [1]'``.

    Each distinct key receives a number the first time it is processed. The
    commands are processed in the order of the internal lookup list (all
    occurrences of one command before the next one), and ``\eqref`` shares the
    same numbering but is wrapped with the equation cite tags. A command whose
    brace is never closed is left untouched.

    :param s: Latex string code
    :param sort_cites: Sorts the cite numbers
    :param compress_cite: Compress the cite numbers, ex ``[1, 2, 3, 10]`` to ``[1-3, 10]``
    :param cite_separator: Separator of cites, for example ``[1{sep}2{sep}3]``
    :param kwargs: Optional keyword arguments; ``pb`` is a progress bar to update
    :return: Latex with cite as numbers
    """
    assert isinstance(cite_separator, str)
    cites = {}
    look = ['\\cite*{', '\\citet*{', '\\citep*{', '\\cite{', '\\citet{', '\\citep{',
            '\\newcite{', '\\newcite*{', '\\cite* {', '\\citet* {', '\\citep* {',
            '\\cite {', '\\citet {', '\\citep {', '\\newcite {', '\\newcite* {']
    look_eqn = ['\\eqref{']
    look += look_eqn

    while True:
        # Find the first command (in lookup order) still present in the text;
        # commands that no longer appear are dropped from the lookup list
        k = -1
        run_j = ''
        for j in look.copy():
            k = find_str(s, j)
            if k == -1:
                look.remove(j)
            else:
                run_j = j
                break
        if k == -1:
            if kwargs.get('pb'):  # Update progressbar
                kwargs.get('pb').update('Processing cites')
            return s

        end = s.find('}', k)
        if end == -1:
            # Unterminated command: nothing after it can be closed either, so
            # stop looking for this command instead of failing
            look.remove(run_j)
            continue

        # Create the number of the cites
        cite_nums: List[int] = []
        for w in s[k + len(run_j):end].split(','):
            w = w.strip()
            if w not in cites:
                cites[w] = len(cites) + 1
            cite_nums.append(cites[w])

        # Sort the cites
        if sort_cites:
            cite_nums.sort()

        # Compress
        if compress_cite:
            new_cites = _compress_cite_numbers(cite_nums)
        else:
            new_cites = [str(w) for w in cite_nums]

        c = cite_separator.join(new_cites)
        eqn_mode = run_j in look_eqn
        open_cite = _TAG_OPEN_CITE_EQN if eqn_mode else _TAG_OPEN_CITE
        close_cite = _TAG_CLOSE_CITE_EQN if eqn_mode else _TAG_CLOSE_CITE
        s = s[:k] + FONT_FORMAT_SETTINGS['cite'] + open_cite + c + \
            close_cite + FONT_FORMAT_SETTINGS['normal'] + s[end + 1:]


def process_citeauthor(
    s: str,
    lang: str,
    **kwargs
) -> str:
    r"""
    Transforms all citeauthor to [cite]. For example:
    ``'This is from \citeauthor{Pizarro}, and that is from \citeauthor{cite1, cite2}'`` to
    ``'This is from [author], and that is from [authors]'``.

    The replacement text is taken from the language tags (single or multiple
    author variant, depending on the number of comma-separated keys). A command
    whose brace is never closed is left untouched (previously this raised
    ``IndexError`` or looped forever).

    :param s: Latex string code
    :param lang: Language tag of the code
    :param kwargs: Optional keyword arguments; ``pb`` is a progress bar to update
    :return: Latex with replaced cites
    """
    command = '\\citeauthor{'
    while True:
        k = find_str(s, command)
        end = s.find('}', k) if k != -1 else -1
        if end == -1:
            # Either no command is left, or the first remaining one is never
            # closed (then no later one can be closed either)
            if kwargs.get('pb'):  # Update progressbar
                kwargs.get('pb').update('Processing citeauthor')
            return s

        # Count the number of cites
        keys = s[k + len(command):end].split(',')
        c = LANG_TT_TAGS.get(lang, 'citeauthor_single' if len(keys) == 1 else 'citeauthor_multiple')

        # Write cite
        s = s[:k] + FONT_FORMAT_SETTINGS['cite'] + _TAG_OPEN_CITE + c + \
            _TAG_CLOSE_CITE + FONT_FORMAT_SETTINGS['normal'] + s[end + 1:]


def replace_pydetex_tags(
    s: str,
    cite_format: Tuple[str, str] = ('[', ']'),
    **kwargs
) -> str:
    """
    Converts the internal placeholder tags into their final text.

    :param s: Latex string code
    :param cite_format: Cite format, as (opening text, closing text)
    :param kwargs: Optional keyword arguments; ``pb`` is a progress bar to
        update, ``replace_pydetex_tag_dollar_symbol`` (default ``True``)
        controls whether the dollar tag is converted back to ``$``
    :return: String with no cites
    """
    assert len(cite_format) == 2
    cite_open, cite_close = cite_format[0], cite_format[1]

    # (tag, final text), applied in this exact order
    replacements = [
        (_TAG_OPEN_CITE, cite_open),
        (_TAG_OPEN_CITE_EQN, '('),
        (_TAG_CLOSE_CITE, cite_close),
        (_TAG_CLOSE_CITE_EQN, ')'),
        (_TAG_ITEM_SPACE, ' '),
        (_TAG_PERCENTAGE_SYMBOL, '%'),
        (_TAG_BRACE_OPEN, '{'),
        (_TAG_BRACE_CLOSE, '}'),
        (_TAG_NEW_LINE, '\n'),
    ]
    if kwargs.get('replace_pydetex_tag_dollar_symbol', True):
        replacements.append((_TAG_DOLLAR_SYMBOL, '$'))

    for tag, text in replacements:
        s = s.replace(tag, text)

    progress = kwargs.get('pb')
    if progress:  # Update progressbar
        progress.update('Replacing pydetex tags')
    return s


def process_labels(s: str, **kwargs) -> str:
    r"""
    Removes labels, that is, every ``\label{...}`` command with its argument.

    A ``\label{`` whose brace is never closed is left untouched (previously
    this raised ``IndexError``, or looped forever when it started the string).

    :param s: Latex string code
    :param kwargs: Optional keyword arguments; ``pb`` is a progress bar to update
    :return: String with no labels
    """
    command = '\\label{'
    while True:
        k = s.find(command)
        # The label ends at the first closing brace after the command
        end = s.find('}', k) if k != -1 else -1
        if end == -1:
            # No label left, or the first remaining one is never closed (then
            # no later one can be closed either)
            if kwargs.get('pb'):  # Update progressbar
                kwargs.get('pb').update('Processing labels')
            return s
        s = s[:k] + s[end + 1:]


def process_ref(s: str, **kwargs) -> str:
    r"""
    Process references, same as cites, replace by numbers.

    Handles ``\ref``, ``\ref*`` and ``\autoref``. Each distinct label gets a
    number the first time it is processed; the commands are processed in that
    order (all occurrences of one command before the next one). A command whose
    brace is never closed is left untouched (previously this raised
    ``IndexError`` or looped forever).

    :param s: Latex string code
    :param kwargs: Optional keyword arguments; ``pb`` is a progress bar to update
    :return: String with numbers instead of references.
    """
    look = ['\\ref{', '\\ref*{', '\\autoref{']
    refs = []
    while True:
        # Find the first command (in lookup order) still present in the text;
        # commands that no longer appear are dropped from the lookup list
        k = -1
        run_j = ''
        for j in look.copy():
            k = find_str(s, j)
            if k == -1:
                look.remove(j)
            else:
                run_j = j
                break
        if k == -1:
            if kwargs.get('pb'):  # Update progressbar
                kwargs.get('pb').update('Processing references')
            return s

        end = s.find('}', k)
        if end == -1:
            # Unterminated command: stop looking for it instead of failing
            look.remove(run_j)
            continue

        ref_label = s[k + len(run_j):end].strip()
        if ref_label not in refs:
            refs.append(ref_label)
        ref_idx = refs.index(ref_label) + 1
        s = s[:k] + FONT_FORMAT_SETTINGS['ref'] + str(ref_idx) + FONT_FORMAT_SETTINGS['normal'] + s[end + 1:]


def remove_comments(s: str, **kwargs) -> str:
    r"""
    Strips the ``%`` comments from the text.

    Escaped percentages (``\%``) are protected with a tag, every line is
    stripped, and a line that carried a comment is glued to the following
    non-empty line. Consecutive empty lines are collapsed into a single one.

    :param s: Latex string code
    :param kwargs: Optional keyword arguments; ``pb`` is a progress bar to update
    :return: String without comments
    """
    newline_symbol = '⇱NEWLINE_SYMBOL_REMOVE_COMMENTS⇲'

    # Protect the sequences that must not be confused with comments
    text = s.replace('  ', ' ').replace('\\\\', newline_symbol)
    text = text.replace('\\%', _TAG_PERCENTAGE_SYMBOL).replace('\\\n', '\n')

    # Cut each (stripped) line at its first '%', remembering which had a comment
    lines: List[str] = []
    has_comment: List[bool] = []
    for raw in text.split('\n'):
        head, sep, _ = raw.strip().partition('%')
        lines.append(head)
        has_comment.append(sep != '')

    # Sentinel entry, so that every real line has a successor
    lines.append('')
    has_comment.append(False)

    # A non-empty line right after a commented one is merged as well
    merge: List[bool] = list(has_comment)
    for idx in range(len(lines) - 1):
        if has_comment[idx] and not has_comment[idx + 1] and lines[idx + 1] != '':
            merge[idx + 1] = True

    # Merge comment lines
    joined: List[str] = []
    pending = ''
    for line, must_merge in zip(lines, merge):
        if must_merge:
            pending += line
            continue
        if pending != '':  # Add current merged str
            joined.append(pending)
            pending = ''
        joined.append(line)
    if pending != '':
        joined.append(pending)

    # Remove duplicates '' lines to single ''
    compact: List[str] = []
    previous = ''
    for line in joined:
        if line != '' or previous != '':
            compact.append(line)
        previous = line

    if compact and compact[-1] == '':  # Removes last space
        compact.pop()
    result = '\n'.join(compact).strip().replace(newline_symbol, '\\\\')

    progress = kwargs.get('pb')
    if progress:  # Update progressbar
        progress.update('Removing comments')
    return result


[docs]def simple_replace(s: str, **kwargs) -> str:
    """
    Replace simple tokens.

    Three passes are applied, in this order: the plain symbol library, the
    tex-command library (only when the match is a whole command), and the
    equation-symbol library (only inside the regions reported by
    ``ut.find_tex_command_char``).

    :param s: Latex string code
    :param kwargs: Optional keyword arguments; ``pb`` is a progress bar to update
    :return: String with replaced items
    """
    # Plain textual substitutions, each entry is (text, replacement)
    for w in REPLACE_SYMBOLS_LIBRARY:
        s = s.replace(w[0], w[1])

    # Replace unique symbols
    # A trailing space is added so that s[k + len(word)] below always exists;
    # it is removed again after the loop
    s += ' '
    invalid_tag = '⇱SYMBOL_REPLACE_TAG_TOKEN⇲'
    for w in REPLACE_TEX_COMMANDS_LIBRARY:
        word, repl = w
        while True:
            k = s.find(word)
            if k == -1:
                break
            # Replace only if the character after the match cannot continue a
            # command name, i.e. the match is the whole command
            if s[k + len(word)] not in ut.TEX_COMMAND_CHARS:
                s = s[0:k] + repl + s[k + len(word):]
            else:
                # The match is the prefix of a longer command: break it with a
                # temporary tag after its first character so it is not found again
                s = s[0:k + 1] + invalid_tag + s[k + 1:]  # format ...\\INVALID_TAG...
    # Drop the trailing space and restore the commands that were broken above
    s = s[0:len(s) - 1].replace(invalid_tag, '')

    # Replace equation symbols
    # Escaped dollars are protected first, so they are not taken as delimiters
    s = s.replace(r'\$', _TAG_DOLLAR_SYMBOL)
    tex_tags = ut.find_tex_command_char(s, ut.TEX_EQUATION_CHARS)
    # NOTE(review): the loop reads items [1], [2] and [3] of each tag;
    # presumably [1]..[2] delimit the equation content and [3] is the last
    # position of the closing delimiter - confirm against ut.find_tex_command_char
    new_s = ''
    k = 0  # Moves through tags
    added_s = False  # True once the content of the current tag has been written
    for i in range(len(s)):
        if k < len(tex_tags):
            if i < tex_tags[k][1]:
                # Before the content of the current tag, copy verbatim
                new_s += s[i]
            # 'and' binds tighter than 'or': (inside the content and not yet
            # written) or (the content is a single position)
            elif tex_tags[k][1] <= i < tex_tags[k][2] and not added_s or tex_tags[k][1] == i == tex_tags[k][2]:
                if not added_s:
                    # The whole content is written at once, when first reached
                    k_s: str = s[tex_tags[k][1]:tex_tags[k][2] + 1]
                    # Replace
                    for j in REPLACE_EQUATION_SYMBOLS_LIBRARY:
                        k_s = k_s.replace(j[0], j[1])
                    new_s += k_s
                added_s = True
            elif tex_tags[k][2] < i < tex_tags[k][3]:
                # Between the content and the end of the tag, copy verbatim
                new_s += s[i]
            elif i == tex_tags[k][3]:  # Advance to another tag
                new_s += s[i]
                k += 1
                added_s = False
        else:
            # No tags left, copy the rest verbatim
            new_s += s[i]

    if kwargs.get('pb'):  # Update progressbar
        kwargs.get('pb').update('Replacing simple tokens')
    return new_s


def _load_file_search(tex_file: str, print_error: bool = False) -> str:
    """
    Looks for a file in several folders and loads the first one found.

    The current folder is tried first, then the parent folder, and finally
    each sub-folder of the current folder.

    :param tex_file: Name of the file
    :param print_error: Prints if file not found
    :return: Loaded file or tag error
    """
    subfolders = _os_listfolder()
    for folder in ['./', '../'] + subfolders:
        contents = _load_file(tex_file, folder)
        if contents != _TAG_FILE_ERROR:
            return contents
        if print_error:
            print(f'\tFile not found in {folder}')
    return _TAG_FILE_ERROR


def process_inputs(
    s: str,
    clear_not_found_files: bool = False,
    **kwargs
) -> str:
    r"""
    Process inputs, which find the input files and retrieve its contents.

    Comments are removed first. Each ``\input{file}`` is then replaced by the
    contents of the file (``.tex`` is appended if missing), which is searched
    in the current folder, its parent and its sub-folders. Inputs that cannot
    be resolved (file not found, ``\jobname`` in the name, or a brace that is
    never closed) are kept as written. Files that were not found are cached at
    module level, so they are not searched again until the working directory
    changes or ``clear_not_found_files`` is set.

    :param s: Latex string code with inputs
    :param clear_not_found_files: Clear the not found files. Used when changing the path
    :param kwargs: Optional keyword arguments; ``pb`` is a progress bar to
        update, ``print`` (default ``True``) enables the console messages
    :return: Text copied with data from inputs
    """
    global _PRINT_LOCATION, _NOT_FOUND_FILES
    if os.getcwd() != _LAST_NOT_FOUND_FILES_PATH[0] or clear_not_found_files:
        _LAST_NOT_FOUND_FILES_PATH[0] = os.getcwd()
        _NOT_FOUND_FILES.clear()
        _PRINT_LOCATION = False
    print_ = kwargs.get('print', True)
    command = '\\input{'
    # Unresolved inputs are masked with this symbol so they are not found
    # again by the loop; the symbol is turned back into the command at the end
    symbol = '⇱INPUT_FILE_TAG⇲'
    s = remove_comments(s)
    while True:
        k = find_str(s, command)
        if k == -1:
            if kwargs.get('pb'):  # Update progressbar
                kwargs.get('pb').update('Processing \\input')
            return s.replace(symbol, command)

        end = s.find('}', k)
        if end == -1:
            # Unterminated \input{: mask it and keep the text after it, instead
            # of raising IndexError or looping forever
            s = s[:k] + symbol + s[k + len(command):]
            continue

        # The file name starts after the last '{' that precedes the closing brace
        m = s.rfind('{', k, end)
        tex_file = s[m + 1:end]
        if '.tex' not in tex_file:
            tex_file += '.tex'

        if tex_file in _NOT_FOUND_FILES or r'\jobname' in tex_file:
            # Known missing file or unresolvable name, mask the command
            s = s[:k] + symbol + s[m + 1:]
            continue

        if not _PRINT_LOCATION:
            if print_:
                print(f'Current path location:\n\t{os.getcwd()}')
            _PRINT_LOCATION = True
        if print_:
            print(f'Detected file {tex_file}:')
        tx = _load_file_search(tex_file, print_error=print_)
        if tx == _TAG_FILE_ERROR:
            _NOT_FOUND_FILES.append(tex_file)
            s = s[:k] + symbol + s[m + 1:]
        else:
            if print_:
                print('\tFile found and loaded')
            # Normalize the line endings before removing the comments
            tx = '\n'.join(tx.splitlines())
            tx = remove_comments(tx)
            s = s[:k] + tx + s[end + 1:]


def remove_commands_char(s: str, chars: List[Tuple[str, str, bool]]) -> str:
    """
    Drops every region delimited by the given char pairs.

    The regions are located by ``ut.find_tex_command_char``; for each of them
    everything from its item ``[0]`` up to (and including) its item ``[3]`` is
    discarded.

    :param s: Latex string code
    :param chars: Char that define equations [(initial, final, ignore escape), ...]
    :return: Code with removed chars
    """
    regions = ut.find_tex_command_char(s, symbols_char=chars)
    if len(regions) == 0:
        return s

    kept: List[str] = []
    total = len(regions)
    current = 0  # Moves through tags
    for pos, ch in enumerate(s):
        if current >= total:
            # No regions left, keep the remaining text
            kept.append(ch)
        elif pos < regions[current][0]:
            kept.append(ch)
        elif pos == regions[current][3]:  # Advance to other tag
            current += 1
    return ''.join(kept)


def remove_equations(s: str, **kwargs) -> str:
    """
    Strips every equation, as delimited by ``ut.TEX_EQUATION_CHARS``.

    :param s: Latex string code
    :param kwargs: Optional keyword arguments; ``pb`` is a progress bar to update
    :return: Latex without equation
    """
    without_equations = remove_commands_char(s, chars=ut.TEX_EQUATION_CHARS)
    progress = kwargs.get('pb')
    if progress:  # Update progressbar
        progress.update('Removing equations')
    return without_equations


def output_text_for_some_commands(
    s: str,
    lang: str
) -> str:
    """
    Replaces the command for a particular text.

    Each command found in ``s`` (through ``ut.get_tex_commands_args``) is
    compared against an internal table; when the name and the number of
    arguments match, the selected arguments are cleaned recursively with
    ``remove_commands_param``, wrapped with the font tags and written with the
    template of the command. Commands that are not in the table produce no text.

    :param s: Latex string code
    :param lang: Language tag of the code
    :return: Text string or empty if error
    """
    # Stores the commands to be transformed
    # (
    #   command name,
    #   [(argument number, argument is optional), ...],
    #   tag to be replaced (format template, or callable that receives the arguments),
    #   font_tag ('tex_text_tag' if None),
    #   font_content ('tex_text_tag_content' if None),
    #   add new line (before, after)
    # )
    # The font format is like .... [font tag]YOUR TAG {[font content]YOUR CONTENT} ...[font normal]. In that case, tag to be
    # replaced is 'YOUR TAG {0}, {1}
    # All *arguments will be formatted using the tag
    # A command may be listed more than once with a different number of
    # arguments; the first entry whose arguments match is used
    commands: List[Tuple[
        str,
        List[Union[int, Tuple[int, bool]]],
        Union[str, Callable[..., str]],
        Optional[str],
        Optional[str],
        Tuple[bool, bool]
    ]] = [
        ('ac', [1], '{0}', 'normal', 'normal', (False, False)),  # Acronym
        ('acf', [1], '{0}', 'normal', 'normal', (False, False)),  # Acronym
        ('acl', [1], '{0}', 'normal', 'normal', (False, False)),  # Acronym
        ('acs', [1], '{0}', 'normal', 'normal', (False, False)),  # Acronym
        ('cancel', [1], '{0}', 'normal', 'strike', (False, False)),
        ('caption', [1], LANG_TT_TAGS.get(lang, 'caption'), None, None, (False, True)),
        ('chapter', [1], '{0}', 'normal', 'bold', (True, True)),
        ('chapter*', [1], '{0}', 'normal', 'bold', (True, True)),
        ('doublequotes', [1], lambda t: '"{0}"'.format(t), 'normal', 'normal', (False, False)),
        ('em', [1], '{0}', 'normal', 'bold', (False, False)),
        ('emph', [1], '{0}', 'normal', 'italic', (False, False)),
        ('enquote', [1], lambda t: '"{0}"'.format(t), 'normal', 'normal', (False, False)),
        ('frac', [1, 2], '{0}/{1}', 'normal', 'normal', (False, False)),
        ('hl', [1], '{0}', 'normal', 'hl', (False, False)),
        ('href', [2], LANG_TT_TAGS.get(lang, 'link'), None, None, (False, False)),
        # (Template) Informe
        ('insertimage', [3], LANG_TT_TAGS.get(lang, 'figure_caption'), None, None, (False, True)),
        # (Template) Informe
        ('insertimage', [4], LANG_TT_TAGS.get(lang, 'figure_caption'), None, None, (False, False)),
        # (Template) Informe
        ('insertimageboxed', [4], LANG_TT_TAGS.get(lang, 'figure_caption'), None, None, (False, True)),
        # (Template) Informe
        ('insertimageboxed', [5], LANG_TT_TAGS.get(lang, 'figure_caption'), None, None, (False, True)),
        # (Template) Professional-CV
        ('institutionentry', [1, 2, 3, 4], '{0} ({1}-{2}). {3}', 'normal', 'normal', (False, False)),
        # (Template) Professional-CV. Only two arguments are collected, so the
        # template must use {0} and {1}; '{3}' made format() fail and the raw
        # template was written instead
        ('institutionentrynodate', [1, 2], '{0}. {1}', 'normal', 'normal', (False, False)),
        ('lowercase', [1], lambda t: t.lower(), 'normal', 'normal', (False, False)),
        ('MakeLowercase', [1], lambda t: t.lower(), 'normal', 'normal', (False, False)),
        ('MakeUppercase', [1], lambda t: t.upper(), 'normal', 'normal', (False, False)),
        ('otherentry', [1, 2], '{0} {1}', 'normal', 'normal', (False, False)),  # (Template) Professional-CV
        ('paragraph', [1], '{0}', 'normal', 'bold', (True, True)),
        ('quotes', [1], lambda t: '"{0}"'.format(t), 'normal', 'normal', (False, False)),
        ('section', [1], '{0}', 'normal', 'bold', (True, True)),
        ('section*', [1], '{0}', 'normal', 'bold', (True, True)),
        ('so', [1], '{0}', 'normal', 'normal', (False, False)),
        ('sout', [1], '{0}', 'normal', 'strike', (False, False)),
        ('st', [1], '{0}', 'normal', 'strike', (False, False)),
        ('subfloat', [(1, True)], LANG_TT_TAGS.get(lang, 'sub_figure_title'), None, None, (False, True)),
        ('subparagraph', [1], '{0}', 'normal', 'bold', (True, True)),
        ('subsection', [1], '{0}', 'normal', 'bold', (True, True)),
        ('subsection*', [1], '{0}', 'normal', 'bold', (True, True)),
        ('subsubsection', [1], '{0}', 'normal', 'bold', (True, True)),
        ('subsubsection*', [1], '{0}', 'normal', 'bold', (True, True)),
        ('subsubsubsection', [1], '{0}', 'normal', 'bold', (True, True)),
        ('subsubsubsection*', [1], '{0}', 'normal', 'bold', (True, True)),
        ('text', [1], '{0}', 'normal', 'normal', (False, False)),
        ('textbf', [1], '{0}', 'normal', 'bold', (False, False)),
        ('textit', [1], '{0}', 'normal', 'italic', (False, False)),
        ('texttt', [1], '{0}', 'normal', 'normal', (False, False)),
        ('underline', [1], '{0}', 'normal', 'underline', (False, False)),
        ('uppercase', [1], lambda t: t.upper(), 'normal', 'normal', (False, False))
    ]
    new_s = ''

    # Get the commands
    for c in ut.get_tex_commands_args(s):
        for cmd in commands:
            if c[0] == cmd[0]:
                _, cmd_args, cmd_tag, font_tag, font_content, cmd_newline = cmd

                # The command must have as many arguments as the highest
                # argument number requested (at least the number of entries)
                total_arguments = len(cmd_args)
                for cc in cmd_args:
                    total_arguments = max(cc[0] if isinstance(cc, tuple) else cc, total_arguments)
                if font_tag is None:
                    font_tag = 'tex_text_tag'
                if font_content is None:
                    font_content = 'tex_text_tag_content'
                if len(c) - 1 == total_arguments:
                    args = []
                    for j in cmd_args:
                        if isinstance(j, tuple):
                            cmd_argnum, cmd_is_optional = j
                        else:
                            cmd_argnum, cmd_is_optional = (j, False)
                        # c[n] is read as (argument text, argument is optional)
                        if len(c) - 1 >= cmd_argnum >= 0 and c[cmd_argnum][1] == cmd_is_optional:
                            argv = c[cmd_argnum][0].replace('\n', ' ')  # Command's argument to process
                            argv = remove_commands_param(argv, lang)  # Remove commands within the argument
                            args.append(argv.strip())
                    # Write the text only if every requested argument was found
                    if len(args) == len(cmd_args):
                        # Add format text
                        for a in range(len(args)):
                            args[a] = FONT_FORMAT_SETTINGS[font_content] + args[a] + FONT_FORMAT_SETTINGS[font_tag]
                        if callable(cmd_tag):
                            text = cmd_tag(*args)
                        else:
                            try:
                                text = cmd_tag.format(*args)
                            except IndexError:
                                # Template with more fields than arguments,
                                # write it as it is
                                text = cmd_tag
                        text = FONT_FORMAT_SETTINGS[font_tag] + text + FONT_FORMAT_SETTINGS['normal']
                        if cmd_newline[0]:
                            text = _TAG_NEW_LINE + text
                        new_s += text
                        if cmd_newline[1]:
                            new_s += _TAG_NEW_LINE
                        break

    return new_s.strip()


def remove_environments(
    s: str,
    env_list: Optional[List[str]] = None,
    **kwargs
) -> str:
    """
    Remove a selection of environments.

    An environment found by ``ut.find_tex_environments`` is removed when any
    name of ``env_list`` is contained in its name. The default list now spells
    ``thebibliography`` correctly (it was ``thebibiliography``, which never
    matched a real bibliography), and the progress bar is updated on every
    return path.

    :param s: Latex code
    :param env_list: Environment list, if not defined, use the default from PyDetex
    :param kwargs: Optional keyword arguments; ``pb`` is a progress bar to update
    :return: Code without given environments
    """
    if not env_list:
        env_list = [
            'lstlisting',
            'references',
            'minted',
            'sourcecode',
            'tabular',
            'thebibliography',
            'tikzpicture',
            'verbatim'
        ]
    tex_tags = ut.find_tex_environments(s)

    # Keep only the environments whose name matches the list
    removed_tags = [t for t in tex_tags if any(name in t[0] for name in env_list)]
    if len(removed_tags) == 0:
        if kwargs.get('pb'):  # Update progressbar
            kwargs.get('pb').update('No environment found in code')
        return s

    def is_in_tags(v: int) -> bool:
        """
        Check if a position is within tags.

        :param v: Position
        :return: True if in tags range
        """
        # Items [1] and [4] + 1 of a tag are used as the first and the last
        # position to remove
        return any(t[1] <= v <= t[4] + 1 for t in removed_tags)

    new_s = ''.join(ch for i, ch in enumerate(s) if not is_in_tags(i))

    if kwargs.get('pb'):  # Update progressbar
        kwargs.get('pb').update('Removing environments')
    return new_s


def remove_commands_param(
    s: str,
    lang: str,
    invalid_commands: Optional[List[str]] = None,
    **kwargs
) -> str:
    """
    Strips the commands that take parameters, writing a text for the known ones.

    The commands are located by ``ut.find_tex_commands``. Each one is dropped
    from the text and, unless its name contains one of the invalid commands,
    replaced by the output of ``output_text_for_some_commands``. Finally, all
    the unescaped braces are removed.

    :param s: Latex string code
    :param lang: Language tag of the code
    :param invalid_commands: Invalid commands that will not call output_text_for_some_commands. If ``None`` use default
    :param kwargs: Optional keyword arguments; ``pb`` is a progress bar to update
    :return: Code with removed chars
    """
    found = ut.find_tex_commands(s)
    if len(found) == 0:
        progress = kwargs.get('pb')
        if progress:  # Update progressbar
            progress.update('No parameter commands found in code')
        return s

    # invalid commands that will not call output_text_for_some_commands
    if not invalid_commands:
        invalid_commands = [
            'DeclareUnicodeCharacter',
            'ifthenelse',
            'newcommand',
            'newenvironment',
            'usepackage'
        ]

    chunks: List[str] = []
    total = len(found)
    current = 0  # Moves through tags
    for pos, ch in enumerate(s):
        if current >= total:
            chunks.append(ch)
            continue
        tag = found[current]
        if pos < tag[0]:
            chunks.append(ch)
            continue
        if pos < tag[3] + 1:
            continue  # Inside the command, nothing is written

        # Reached the end of the command, advance to other tag. If the
        # command does not continue (tag[4] is false), the slice holds the
        # command with all its parameters, for example \mycommand{1}{2}{3},
        # and its text can be written
        if not tag[4]:
            full_command = s[tag[0]:tag[3] + 2]
            cmd_name = s[tag[0]:tag[1] + 1].strip()
            # Check if the invalid_commands are not within command name
            if not any(bad in cmd_name for bad in invalid_commands):
                chunks.append(output_text_for_some_commands(full_command, lang))
        current += 1

    # Replace all command symbols: the escaped braces/brackets are protected,
    # the plain braces removed, and the escaped ones restored
    protected = (
        ('\\{', '⇱PARENTHESIS_OPEN_SYMBOL⇲'),
        ('\\}', '⇱PARENTHESIS_CLOSE_SYMBOL⇲'),
        ('\\[', '⇱PARENTHESIS_SQ_OPEN_SYMBOL⇲'),
        ('\\]', '⇱PARENTHESIS_SQ_CLOSE_SYMBOL⇲'),
    )
    new_s = ''.join(chunks)
    for escaped, symbol in protected:
        new_s = new_s.replace(escaped, symbol)
    new_s = new_s.replace('{', '').replace('}', '')
    for escaped, symbol in protected:
        new_s = new_s.replace(symbol, escaped)

    progress = kwargs.get('pb')
    if progress:  # Update progressbar
        progress.update('Removing command with parameters')
    return new_s


def remove_commands_param_noargv(s: str, **kwargs) -> str:
    """
    Remove all commands without arguments.

    Each span reported by ``ut.find_tex_commands_noargv`` is cut out of the
    string (both the start and the end index are removed); everything else is
    kept verbatim.

    :param s: Latex string code
    :param kwargs: Optional keyword arguments; if ``pb`` (a progress bar) is given, its ``update`` method is called with a status message
    :return: Code with removed chars
    """
    pb = kwargs.get('pb')
    tex_tags = ut.find_tex_commands_noargv(s)
    if len(tex_tags) == 0:
        if pb:  # Update progressbar
            pb.update('No command without arguments were found in code')
        return s

    # Copy the text between consecutive commands instead of appending char by
    # char. NOTE(review): assumes the tags are sorted and do not overlap,
    # which the position-by-position scan this replaces also relied on.
    kept = []
    cursor = 0  # First index not yet copied nor discarded
    for tag in tex_tags:
        kept.append(s[cursor:tag[0]])
        cursor = tag[1] + 1  # The end index of the command is dropped as well
    kept.append(s[cursor:])

    if pb:  # Update progressbar
        pb.update('Removing command without arguments')
    return ''.join(kept)


def unicode_chars_equations(s: str, **kwargs) -> str:
    """
    Converts all equations to unicode.

    The text outside the equations, as well as the equation delimiters, are
    copied untouched. The content of each equation is passed through
    ``ut.tex_to_unicode``, and any escaped brace left in the converted text is
    replaced by the brace tags.

    :param s: Latex string code
    :param kwargs: Optional keyword arguments; if ``pb`` (a progress bar) is given, its ``update`` method is called with a status message
    :return: Latex with unicode converted
    """
    tex_tags = ut.find_tex_command_char(s, ut.TEX_EQUATION_CHARS)
    total_tags = len(tex_tags)
    chunks = []
    k = 0  # Moves through tags
    converted = False  # True once the current equation has been written

    for i, char in enumerate(s):
        if k >= total_tags:
            # No more equations, the remaining text is copied as is
            chunks.append(s[i:])
            break

        # Indices 1 and 2 are used as the first/last position of the equation
        # content, index 3 as the last position of the whole tag
        tag = tex_tags[k]
        eq_first, eq_last, tag_last = tag[1], tag[2], tag[3]

        if i < eq_first:
            chunks.append(char)
        elif (eq_first <= i < eq_last and not converted) or eq_first == i == eq_last:
            if not converted:
                equation = ut.tex_to_unicode(s[eq_first:eq_last + 1])
                equation = equation.replace(r'\{', _TAG_BRACE_OPEN).replace(r'\}', _TAG_BRACE_CLOSE)
                chunks.append(equation)
            converted = True
        elif eq_last < i < tag_last:
            chunks.append(char)
        elif i == tag_last:  # Advance to other tag
            chunks.append(char)
            k += 1
            converted = False
        # Any other position lies inside an equation that was already
        # written as a whole, thus, it is skipped

    if kwargs.get('pb'):  # Update progressbar
        kwargs.get('pb').update('Processing unicode equations')
    return ''.join(chunks)


def process_chars_equations(
    s: str,
    lang: str,
    single_only: bool,
    **kwargs
) -> str:
    """
    Process single char equations, removing the symbols.

    The equation delimiters are always dropped. A single char equation is
    kept, wrapped by the equation font tags. A longer equation is either
    copied as is (``single_only=True``) or replaced by a numbered text-label.

    :param s: Latex string code
    :param lang: Language tag of the code
    :param single_only: Only process single char equations. If False, replaces the equation by a text-label
    :param kwargs: Optional keyword arguments; if ``pb`` (a progress bar) is given, its ``update`` method is called with a status message
    :return: Code without symbols
    """
    pb = kwargs.get('pb')
    tex_tags = ut.find_tex_command_char(s, ut.TEX_EQUATION_CHARS)
    if len(tex_tags) == 0:
        if pb:  # Update progressbar
            pb.update('No char equations found')
        return s

    total_tags = len(tex_tags)
    chunks = []
    k = 0  # Moves through tags
    eqn_number = 0  # Counter used by the multi-char equation labels
    added_equ = False  # True once the current equation has been written

    for i, char in enumerate(s):
        if k >= total_tags:
            # No more equations, the remaining text is copied as is
            chunks.append(s[i:])
            break

        tag = tex_tags[k]
        if i < tag[0]:
            chunks.append(char)
        elif tag[1] <= i <= tag[2] and not added_equ:
            equ = s[tag[1]:tag[2] + 1]
            if len(equ) == 1:
                chunks.append(FONT_FORMAT_SETTINGS['equation'] + char + FONT_FORMAT_SETTINGS['normal'])
            elif not single_only:
                label = LANG_TT_TAGS.get(lang, 'multi_char_equ').format(eqn_number)
                chunks.append(FONT_FORMAT_SETTINGS['equation'] + label + FONT_FORMAT_SETTINGS['normal'])
                eqn_number += 1
            else:
                chunks.append(equ)
            added_equ = True
        elif i == tag[3]:  # Last position of the tag, advance to the next one
            k += 1
            added_equ = False
        # Any other position belongs to a delimiter or to an equation that
        # was already written, thus, it is dropped

    if pb:  # Update progressbar
        pb.update('Processing char equations')
    return ''.join(chunks)


def strip_punctuation(s: str, **kwargs) -> str:
    """
    Strips punctuation. For example, ``'mycode :'`` to ``'mycode:'``.

    A single space placed before ``, : = ; ! ? .`` is removed, each run of
    three newlines is reduced to two, and the leading/trailing whitespace of
    the whole string is stripped. Every replacement is applied only once, so
    longer runs of spaces or newlines are shortened but not fully collapsed.

    :param s: Latex string code
    :param kwargs: Optional keyword arguments; if ``pb`` (a progress bar) is given, its ``update`` method is called with a status message
    :return: Stripped punctuation
    """
    for symbol in (',', ':', '=', ';', '!', '?', '.'):  # Space before the symbol
        s = s.replace(f' {symbol}', symbol)
    s = s.replace('\n\n\n', '\n\n').strip()

    pb = kwargs.get('pb')
    if pb:  # Update progressbar
        pb.update('Stripping punctuation')
    return s


def process_def(
    s: str,
    clear_learned: bool = True,
    replace: bool = False,
    **kwargs
) -> str:
    r"""
    Process \defs. Store the definition, among others.

    Each ``\def<name>{<body>}`` is removed from the code. If the name does not
    contain ``#`` (that is, the definition takes no arguments), the body is
    stored in the learned definitions after being cleaned with
    ``remove_common_tags``.

    :param s: Latex with definitions
    :param clear_learned: Clear the last learned definitions
    :param replace: Replace instances of learned defs
    :param kwargs: Optional keyword arguments; if ``pb`` (a progress bar) is given, its ``update`` method is called with a status message
    :return: Latex without definitions
    """
    pb = kwargs.get('pb')
    if '\\def' not in s:
        if pb:  # Update progressbar
            pb.update('No definitions found in code')
        return s
    if clear_learned:
        _DEFS.clear()

    # Padding, so looking 4 chars ahead never runs out of the string. The
    # padding itself is never scanned, nor copied to the output
    s += '    '
    kept = []
    found_def = False
    a, b, depth = 0, 0, -1  # Def positions: a -> \def, b -> first {
    for i in range(len(s) - 4):
        # After finding a def, check the first and last parenthesis
        if s[i:i + 4] == '\\def' and s[i + 4] not in ut.TEX_COMMAND_CHARS:
            a, b, depth = i, -1, 0
            found_def = True
            continue
        if not found_def:
            kept.append(s[i])
            continue

        # Inside a definition nothing is copied, only the non-escaped braces
        # are tracked to find where the body begins and ends
        if s[i - 1] == '\\':
            continue
        if s[i] == '{':
            if depth == 0:
                b = i
            depth += 1
        elif s[i] == '}':
            depth -= 1
            if depth == 0:
                # Check the name, if not a command, store
                def_name = s[a + 4:b].strip()
                if '#' not in def_name:
                    _DEFS[def_name] = remove_common_tags(s[b + 1:i])
                found_def = False
    new_s = ''.join(kept)

    # Now, if replace defs is enabled, check all non-arg commands and replace if
    # known
    if replace:
        commands = ut.find_tex_commands_noargv(new_s)
        if len(commands) > 0:
            replaced = []
            cursor = 0  # First index of new_s not yet written
            for start, end in commands:
                replaced.append(new_s[cursor:start])
                def_n = new_s[start:end + 1]
                replaced.append(_DEFS.get(def_n, def_n))  # Unknown commands are kept
                cursor = end + 1
            replaced.append(new_s[cursor:])
            new_s = ''.join(replaced)

    if pb:  # Update progressbar
        pb.update('Processing definitions')

    return new_s


def process_items(s: str, lang: str, **kwargs) -> str:
    """
    Process itemize and enumerate.

    The ``itemize``, ``enumerate`` and ``tablenotes`` environments are
    replaced, one at a time, by their plain-text list. The environments
    reported together with another item environment (nested lists) are
    converted first, using the depth given by ``ut.find_tex_environments``;
    then, the remaining ones are converted and their commands removed.

    :param s: Latex string code
    :param lang: Language tag
    :param kwargs: Optional keyword arguments; if ``pb`` (a progress bar) is given, its ``update`` method is called with a status message
    :return: Processed items
    """
    item_envs = ('itemize', 'enumerate', 'tablenotes')
    pb = kwargs.get('pb')

    if not any(env in s for env in item_envs):
        if pb:  # Update progressbar
            pb.update('No item found')
        return s

    def _get_name(e: str) -> str:
        """
        Get the environment name.

        :param e: Environment name
        :return: New name, empty if it is not an item environment
        """
        # This is due to itemize can contain other symbols or spaces, thus,
        # itemize*    .. is converted to itemize
        return next((env for env in item_envs if env in e), '')

    def _are_item(e: str) -> bool:
        """
        Return true if the name is an item environment. Used to check
        recursive enumerates.

        :param e: Environment name
        :return: True if item
        """
        return e in item_envs

    # First, process the nested ones. The environments are searched again
    # after each replacement because the positions change
    while True:
        for tag in ut.find_tex_environments(s):
            # NOTE(review): a..d presumably delimit the \begin tag, the body
            # and the \end tag; t2 presumably is the enclosing environment.
            # Confirm against ut.find_tex_environments
            t, a, b, c, d, t2, _, item_depth = tag
            t, t2 = _get_name(t), _get_name(t2)
            if t == '' or t2 == '':
                continue
            if t == t2 or _are_item(t) and _are_item(t2):
                s = s[0:a] + _process_item(s[b:c].strip(), t, item_depth) + s[d + 2:]
                break
        else:  # Nothing was replaced in this pass
            break

    # Not nested
    while True:
        for tag in ut.find_tex_environments(s):
            t, a, b, c, d, _, _, _ = tag
            t = _get_name(t)
            if t == '':
                continue
            s = s[0:a] + remove_commands_param(_process_item(s[b:c].strip(), t), lang) + s[d + 2:]
            break
        else:  # Nothing was replaced in this pass
            break

    if pb:  # Update progressbar
        pb.update('Processing item/enumerate environments')
    return s


def _process_item(s: str, t: str, depth: int = 0) -> str:
    """
    Convert the body of a list environment to plain text.

    Each ``\\item`` is replaced by a new line, indented according to the
    depth, followed by a counter (``enumerate``) or a bullet (any other type).

    :param s: Latex string code (body of the environment)
    :param t: Type (enumerate, itemize)
    :param depth: Depth
    :return: Processed items
    """
    if not s:
        return ''

    indent = '\n' + _TAG_ITEM_SPACE * (3 * depth)
    level = depth % 5  # Both the counters and the bullets cycle every 5 levels

    def _counter(x: int) -> str:
        """
        Get the counter of an enumerated item based on the depth.

        :param x: Number
        :return: Number format by depth
        """
        if level == 1:
            return f'{indent}{_int_to_alph(x).lower()}) '
        if level == 2:
            return f'{indent}{_int_to_roman(x).lower()}. '
        if level == 3:
            return f'{indent}{_int_to_alph(x).upper()}) '
        if level == 4:
            return f'{indent}{_int_to_roman(x).upper()}. '
        return f'{indent}{x}. '

    def _bullet() -> str:
        """
        :return: The item string based on depth.
        """
        symbol = ('-', '•', '◦', '■', '*')[level]
        return f'{indent}{symbol} '

    # Remove optional arguments list, that is, everything up to the square
    # bracket that closes the one at the beginning
    if s[0] == '[':
        unclosed = 1
        for pos in range(1, len(s)):
            if s[pos] == '[':
                unclosed += 1
            elif s[pos] == ']':
                unclosed -= 1
            if unclosed == 0:
                s = s[pos + 1:].strip()
                break

    # Remove invalid newlines: each line is stripped, the empty ones dropped
    stripped_lines = (row.strip() for row in s.split('\n'))
    s = '\n'.join(row for row in stripped_lines if row != '')

    if t != 'enumerate':
        new_s = s.replace('\\item', _bullet())
    else:
        s += ' ' * 5  # Padding, so the 5-char window is always available
        stop = len(s) - 5
        pieces = []
        number = 1
        pos = 0
        while True:
            if s[pos:pos + 5] == '\\item':
                pieces.append(_counter(number))
                number += 1
                # Jump over \item; the char right after it is skipped too by
                # the increment at the end of the loop
                pos += 5
            else:
                pieces.append(s[pos])
            if pos == stop:
                break
            pos += 1
        new_s = ''.join(pieces)

    # Last operations
    return new_s.replace('\n\n', '\n').strip(' ')


def _int_to_roman(number: int) -> str:
    """
    Convert an arabic integer number to a roman.

    The table of roman digits is traversed from the largest value to the
    smallest; each symbol is written as many times as its value fits in what
    is left of the number.

    :param number: Number
    :return: Number in roman
    """
    numerals = []
    remaining = number
    for value, symbol in _ROMAN_DIGITS:
        times, remaining = divmod(remaining, value)
        numerals.append(symbol * times)
    return ''.join(numerals)


def _int_to_alph(n: int) -> str:
    """
    Integer to A..Z, AA..ZZ, and so on (1 is A, 26 is Z, 27 is AA).

    :param n: Number
    :return: Number in uppercase letters, empty if the number is not positive
    """
    letters = []  # Collected from the last letter to the first
    remaining = n
    while remaining > 0:
        # There is no zero digit, thus, shift by one before each division
        remaining, offset = divmod(remaining - 1, 26)
        letters.append(chr(ord('A') + offset))
    return ''.join(reversed(letters))


def process_begin_document(s: str, **kwargs) -> str:
    """
    Removes all code outside begin document, if found.

    Returns the text between the first ``\\begin{document}`` and the first
    ``\\end{document}`` that follows it. If the string does not contain a
    properly opened and closed document environment, it is returned unchanged.

    :param s: Latex code
    :param kwargs: Optional keyword arguments; if ``pb`` (a progress bar) is given, its ``update`` method is called with a status message
    :return: Removes all data outside the document
    """
    pb = kwargs.get('pb')
    doc_tag = '{document}'
    if doc_tag not in s:
        if pb:  # Update progressbar
            pb.update('No document environment found')
        return s

    after_begin = False  # A \begin was found, its name is still pending
    body_start = -1  # First position after \begin{document}
    end_candidate = -1  # Position of the last \end whose name is still pending
    body_end = -1  # Position of the \end that closes the document

    for k, char in enumerate(s):
        open_brace = char == '{' and (k == 0 or s[k - 1] != '\\')  # Not escaped
        if s.startswith('\\begin', k):
            after_begin = True
        elif open_brace and after_begin and body_start < 0:
            if s.startswith(doc_tag, k):
                body_start = k + len(doc_tag)
            else:
                after_begin = False
        elif body_start >= 0 and s.startswith('\\end', k):
            end_candidate = k
        elif body_start >= 0 and end_candidate >= 0 and open_brace:
            if s.startswith(doc_tag, k):
                body_end = end_candidate
                break
            # The \end closes other environment. Forget it, otherwise a later
            # {document} would be paired with such unrelated \end, cutting
            # the document there
            end_candidate = -1

    # If document has been found
    if pb:  # Update progressbar
        pb.update('Processing {document} environment')
    if body_end >= 0:
        return s[body_start:body_end]
    return s