# yichael / image-match
import re
from typing import (
    Any,
    Dict,
    List,
    Match,
    MutableMapping,
    Optional,
)

from .core import InlineState, Parser
from .helpers import (
    HTML_ATTRIBUTES,
    HTML_TAGNAME,
    PREVENT_BACKSLASH,
    PUNCTUATION,
    parse_link,
    parse_link_label,
    parse_link_text,
    unescape_char,
)
from .util import escape_url, unikey

#: optional whitespace followed by a literal closing parenthesis
PAREN_END_RE = re.compile(r"\s*" + re.escape(")"))

#: start-to-end pattern of an e-mail autolink such as ``<user@example.com>``
AUTO_EMAIL = "".join(
    (
        # "<" followed by the local part
        r"""<[a-zA-Z0-9.!#$%&'*+\/=?^_`{|}~-]+""",
        # "@" and the first domain label (an alphanumeric, then optionally
        # up to 61 alphanumerics/hyphens ending in an alphanumeric)
        r"@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?",
        # any number of further dot-separated labels, then the closing ">"
        r"(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*>",
    )
)

#: alternation of every raw inline HTML construct recognised by the parser
INLINE_HTML = "|".join(
    (
        r"<" + HTML_TAGNAME + HTML_ATTRIBUTES + r"\s*/?>",  # open tag
        r"</" + HTML_TAGNAME + r"\s*>",  # close tag
        r"<!--(?!>|->)(?:(?!--)[\s\S])+?(?<!-)-->",  # comment
        r"<\?[\s\S]+?\?>",  # script like <?php?>
        r"<![A-Z][\s\S]+?>",  # doctype
        r"<!\[CDATA[\s\S]+?\]\]>",  # cdata
    )
)

#: closing-delimiter pattern for each emphasis marker ("*", "_", "**", ...).
#: Every pattern requires the character before the closing run to be either
#: an escaped marker or a non-space, non-marker character, and forbids one
#: more marker character right after the run; the "_" variants additionally
#: require a word boundary after the run.
EMPHASIS_END_RE = {
    char * width: re.compile(before + closer * width + after)
    for width in (1, 2, 3)
    for char, before, closer, after in (
        ("*", r"(?:" + PREVENT_BACKSLASH + r"\\\*|[^\s*])", r"\*", r"(?!\*)"),
        ("_", r"(?:" + PREVENT_BACKSLASH + r"\\_|[^\s_])", "_", r"(?!_)\b"),
    )
}


class InlineParser(Parser[InlineState]):
    """Tokenize inline Markdown source into a list of token dicts.

    Every ``parse_<rule>`` method receives the regex match of that rule's
    *start* pattern (see ``SPECIFICATION``) together with the current state.
    It appends tokens to the state and returns the position in ``state.src``
    at which scanning should resume.  Methods declared as returning
    ``Optional[int]`` may return ``None`` to signal "no construct here";
    ``parse`` then emits the single character at the match start as text.
    """

    # NOTE(review): not read inside this class; presumably regex flags used by
    # the ``Parser`` base class when compiling the scanner -- confirm there.
    sc_flag = 0
    #: state type instantiated by ``__call__``
    state_cls = InlineState

    #: standard line break: a backslash or two-or-more spaces right before
    #: the newline; whitespace following the newline is swallowed too
    STD_LINEBREAK = r"(?:\\| {2,})\n\s*"

    #: every new line becomes <br>; also used as the "softbreak" start
    #: pattern when hard wrapping is off
    HARD_LINEBREAK = r" *\n\s*"

    # we only need to find the start pattern of an inline token
    SPECIFICATION = {
        # e.g. \`, \$
        "escape": r"(?:\\" + PUNCTUATION + ")+",
        # `code, ```code
        "codespan": r"`{1,}",
        # *w, **w, _w, __w
        "emphasis": r"\*{1,3}(?=[^\s*])|\b_{1,3}(?=[^\s_])",
        # [link], ![img]
        "link": r"!?\[",
        # <https://example.com>. regex copied from commonmark.js
        "auto_link": r"<[A-Za-z][A-Za-z0-9.+-]{1,31}:[^<>\x00-\x20]*>",
        "auto_email": AUTO_EMAIL,
        "inline_html": INLINE_HTML,
        "linebreak": STD_LINEBREAK,
        "softbreak": HARD_LINEBREAK,
        # "prec_*" entries are cheap prefixes used only by precedence_scan();
        # the "prec_" part is stripped to find the real rule to run.
        "prec_auto_link": r"<[A-Za-z][A-Za-z\d.+-]{1,31}:",
        "prec_inline_html": r"</?" + HTML_TAGNAME + r"|<!|<\?",
    }
    # "softbreak" is intentionally absent: __init__ appends it only when
    # hard wrapping is disabled.
    DEFAULT_RULES = (
        "escape",
        "codespan",
        "emphasis",
        "link",
        "auto_link",
        "auto_email",
        "inline_html",
        "linebreak",
    )

    def __init__(self, hard_wrap: bool = False) -> None:
        """Create the parser.

        :param hard_wrap: when true, every newline is tokenized as a
            ``linebreak``; otherwise plain newlines become ``softbreak``
            tokens and only ``STD_LINEBREAK`` yields a ``linebreak``.
        """
        super().__init__()

        self.hard_wrap = hard_wrap
        # NOTE(review): ``self.specification`` and ``self.rules`` are expected
        # to be set up by the base-class ``__init__`` -- confirm in ``Parser``.
        if hard_wrap:
            # any newline counts as a line break
            self.specification["linebreak"] = self.HARD_LINEBREAK
        else:
            self.rules.append("softbreak")

        # rule name -> bound parse_<name> method, looked up by precedence_scan
        self._methods = {name: getattr(self, "parse_" + name) for name in self.rules}

    def parse_escape(self, m: Match[str], state: InlineState) -> int:
        """Emit a run of backslash escapes as a text token, unescaped."""
        text = m.group(0)
        text = unescape_char(text)
        state.append_token(
            {
                "type": "text",
                "raw": text,
            }
        )
        return m.end()

    def parse_link(self, m: Match[str], state: InlineState) -> Optional[int]:
        """Parse a link or image whose opening ``[`` / ``![`` is matched by *m*.

        Handles the inline form ``[text](url "title")``, the full reference
        form ``[text][label]`` and the bare ``[label]`` form, resolving
        references through ``state.env["ref_links"]``.

        :return: position after the consumed construct, or ``None`` when no
            link or image could be built at this position.
        """
        pos = m.end()

        marker = m.group(0)
        is_image = marker[0] == "!"
        # No image inside an image and no link inside a link: emit the
        # opening marker literally and continue after it.
        if is_image and state.in_image:
            state.append_token({"type": "text", "raw": marker})
            return pos
        elif not is_image and state.in_link:
            state.append_token({"type": "text", "raw": marker})
            return pos

        text = None
        label, end_pos = parse_link_label(state.src, pos)
        if label is None:
            # not usable as a label; fall back to general link text
            text, end_pos = parse_link_text(state.src, pos)
            if text is None:
                return None

        assert end_pos is not None

        if text is None:
            text = label

        assert text is not None

        # Link text that runs to the end of the source cannot be followed by
        # "(...)" or "[...]" and, without a label, cannot be a reference.
        if end_pos >= len(state.src) and label is None:
            return None

        # A code span / autolink / raw HTML starting inside the brackets and
        # reaching past them takes precedence over the link.
        rules = ["codespan", "prec_auto_link", "prec_inline_html"]
        prec_pos = self.precedence_scan(m, state, end_pos, rules)
        if prec_pos:
            return prec_pos

        if end_pos < len(state.src):
            c = state.src[end_pos]
            if c == "(":
                # standard link [text](<url> "title")
                attrs, pos2 = parse_link(state.src, end_pos + 1)
                if pos2:
                    token = self.__parse_link_token(is_image, text, attrs, state)
                    state.append_token(token)
                    return pos2

            elif c == "[":
                # standard ref link [text][label]
                label2, pos2 = parse_link_label(state.src, end_pos + 1)
                if pos2:
                    end_pos = pos2
                    # an empty second label keeps the first one as the key
                    if label2:
                        label = label2

        if label is None:
            return None

        ref_links = state.env.get("ref_links")
        if not ref_links:
            return None

        key = unikey(label)
        env = ref_links.get(key)
        if env:
            attrs = {"url": env["url"], "title": env.get("title")}
            token = self.__parse_link_token(is_image, text, attrs, state)
            token["ref"] = key
            token["label"] = label
            state.append_token(token)
            return end_pos
        return None

    def __parse_link_token(
        self,
        is_image: bool,
        text: str,
        attrs: Optional[Dict[str, Any]],
        state: InlineState,
    ) -> Dict[str, Any]:
        """Build a ``link`` or ``image`` token, rendering *text* as children.

        The children are parsed on a copy of *state* flagged ``in_image`` or
        ``in_link`` so that the same construct is not nested inside itself.
        """
        new_state = state.copy()
        new_state.src = text
        if is_image:
            new_state.in_image = True
            token = {
                "type": "image",
                "children": self.render(new_state),
                "attrs": attrs,
            }
        else:
            new_state.in_link = True
            token = {
                "type": "link",
                "children": self.render(new_state),
                "attrs": attrs,
            }
        return token

    def parse_auto_link(self, m: Match[str], state: InlineState) -> int:
        """Emit ``<scheme:...>`` as a link, or as plain text inside a link."""
        text = m.group(0)
        pos = m.end()
        if state.in_link:
            self.process_text(text, state)
            return pos

        # drop the surrounding angle brackets
        text = text[1:-1]
        self._add_auto_link(text, text, state)
        return pos

    def parse_auto_email(self, m: Match[str], state: InlineState) -> int:
        """Emit ``<user@host>`` as a ``mailto:`` link, or as text inside a link."""
        text = m.group(0)
        pos = m.end()
        if state.in_link:
            self.process_text(text, state)
            return pos

        # drop the surrounding angle brackets
        text = text[1:-1]
        url = "mailto:" + text
        self._add_auto_link(url, text, state)
        return pos

    def _add_auto_link(self, url: str, text: str, state: InlineState) -> None:
        """Append a link token pointing at *url* whose only child is *text*."""
        state.append_token(
            {
                "type": "link",
                "children": [{"type": "text", "raw": text}],
                "attrs": {"url": escape_url(url)},
            }
        )

    def parse_emphasis(self, m: Match[str], state: InlineState) -> int:
        """Parse ``*em*``, ``**strong**`` or ``***both***`` (and ``_`` forms).

        When no matching closing run exists, or the same kind of emphasis is
        already open, the opening marker is emitted as plain text.
        """
        pos = m.end()

        marker = m.group(0)
        mlen = len(marker)
        # do not nest emphasis in emphasis or strong in strong
        if mlen == 1 and state.in_emphasis:
            state.append_token({"type": "text", "raw": marker})
            return pos
        elif mlen == 2 and state.in_strong:
            state.append_token({"type": "text", "raw": marker})
            return pos

        _end_re = EMPHASIS_END_RE[marker]
        m1 = _end_re.search(state.src, pos)
        if not m1:
            state.append_token({"type": "text", "raw": marker})
            return pos

        end_pos = m1.end()
        # content between the opening and the closing marker runs
        text = state.src[pos : end_pos - mlen]

        # a construct that starts inside and ends past the closing marker wins
        prec_pos = self.precedence_scan(m, state, end_pos)
        if prec_pos:
            return prec_pos

        new_state = state.copy()
        new_state.src = text
        if mlen == 1:
            new_state.in_emphasis = True
            children = self.render(new_state)
            state.append_token({"type": "emphasis", "children": children})
        elif mlen == 2:
            new_state.in_strong = True
            children = self.render(new_state)
            state.append_token({"type": "strong", "children": children})
        else:
            # three markers: emphasis wrapping strong
            new_state.in_emphasis = True
            new_state.in_strong = True

            children = [{"type": "strong", "children": self.render(new_state)}]
            state.append_token(
                {
                    "type": "emphasis",
                    "children": children,
                }
            )
        return end_pos

    def parse_codespan(self, m: Match[str], state: InlineState) -> int:
        """Parse a code span delimited by equal-length backtick runs.

        Without a closing run of the same length the opening backticks are
        emitted as plain text.
        """
        marker = m.group(0)
        # require same marker with same length at end

        pattern = re.compile(r"(.*?[^`])" + marker + r"(?!`)", re.S)

        pos = m.end()
        m2 = pattern.match(state.src, pos)
        if m2:
            end_pos = m2.end()
            code = m2.group(1)
            # Line endings are treated like spaces
            code = code.replace("\n", " ")
            # Strip one space from each side, but only when both sides have
            # one and the content is not made of spaces alone.
            if code.strip() and code.startswith(" ") and code.endswith(" "):
                code = code[1:-1]
            state.append_token({"type": "codespan", "raw": code})
            return end_pos
        else:
            state.append_token({"type": "text", "raw": marker})
            return pos

    def parse_linebreak(self, m: Match[str], state: InlineState) -> int:
        """Append a ``linebreak`` token for the matched line ending."""
        state.append_token({"type": "linebreak"})
        return m.end()

    def parse_softbreak(self, m: Match[str], state: InlineState) -> int:
        """Append a ``softbreak`` token for the matched line ending."""
        state.append_token({"type": "softbreak"})
        return m.end()

    def parse_inline_html(self, m: Match[str], state: InlineState) -> int:
        """Emit raw inline HTML and track whether we are inside ``<a>``.

        An opening anchor tag sets ``state.in_link`` and a closing one clears
        it, so Markdown links are not nested inside an HTML link.
        """
        end_pos = m.end()
        html = m.group(0)
        state.append_token({"type": "inline_html", "raw": html})
        if self._is_anchor_tag(html, "<a"):
            state.in_link = True
        elif self._is_anchor_tag(html, "</a"):
            state.in_link = False
        return end_pos

    @staticmethod
    def _is_anchor_tag(html: str, prefix: str) -> bool:
        """Return whether *html* starts with *prefix* as a complete tag name.

        The comparison ignores case, and the tag name must be followed by
        ``>`` or by any whitespace (space, tab, newline, ...), so ``<a\\n``
        counts as an anchor while ``<abbr>`` does not.
        """
        if html[: len(prefix)].lower() != prefix:
            return False
        following = html[len(prefix) : len(prefix) + 1]
        return following == ">" or following.isspace()

    def process_text(self, text: str, state: InlineState) -> None:
        """Append *text* to the state as a plain ``text`` token."""
        state.append_token({"type": "text", "raw": text})

    def parse(self, state: InlineState) -> List[Dict[str, Any]]:
        """Scan ``state.src`` from start to end and return ``state.tokens``.

        Text between rule matches is emitted through ``process_text``.  When
        a rule handler returns a falsy position, one character is emitted as
        text and scanning resumes right after it.
        """
        pos = 0
        sc = self.compile_sc()
        while pos < len(state.src):
            m = sc.search(state.src, pos)
            if not m:
                break

            end_pos = m.start()
            if end_pos > pos:
                # plain text between the cursor and the next rule match
                hole = state.src[pos:end_pos]
                self.process_text(hole, state)

            # NOTE(review): ``parse_method`` is not defined in this class;
            # presumably the base class dispatches to parse_<rule> -- confirm.
            new_pos = self.parse_method(m, state)
            if not new_pos:
                # move cursor 1 character forward
                pos = end_pos + 1
                hole = state.src[end_pos:pos]
                self.process_text(hole, state)
            else:
                pos = new_pos

        if pos == 0:
            # special case, just pure text
            self.process_text(state.src, state)
        elif pos < len(state.src):
            # trailing text after the last match
            self.process_text(state.src[pos:], state)
        return state.tokens

    def precedence_scan(
        self,
        m: Match[str],
        state: InlineState,
        end_pos: int,
        rules: Optional[List[str]] = None,
    ) -> Optional[int]:
        """Look for a construct that overrides the span ``m`` .. *end_pos*.

        Searches between the end of *m* and *end_pos* for the start of one of
        *rules*.  If that construct parses successfully and ends at or after
        *end_pos*, the source from the start of *m* up to the construct is
        emitted as text, followed by the construct's tokens.

        :param m: match of the opening marker of the outer construct.
        :param state: state receiving the tokens on success.
        :param end_pos: end of the outer construct in ``state.src``.
        :param rules: rule names to scan for; defaults to code spans, links,
            autolinks and inline HTML.
        :return: end position of the overriding construct, or ``None`` when
            the outer construct should be kept.
        """
        if rules is None:
            rules = ["codespan", "link", "prec_auto_link", "prec_inline_html"]

        mark_pos = m.end()
        sc = self.compile_sc(rules)
        m1 = sc.search(state.src, mark_pos, end_pos)
        if not m1:
            return None

        lastgroup = m1.lastgroup
        if not lastgroup:
            return None
        # "prec_*" rules only detect a start; re-match with the full rule
        rule_name = lastgroup.replace("prec_", "")
        sc = self.compile_sc([rule_name])
        m2 = sc.match(state.src, m1.start())
        if not m2:
            return None

        func = self._methods[rule_name]
        # run the rule on a scratch state so nothing leaks on failure
        new_state = state.copy()
        new_state.src = state.src
        m2_pos = func(m2, new_state)
        # a construct that closes before end_pos does not override the span
        if not m2_pos or m2_pos < end_pos:
            return None

        raw_text = state.src[m.start() : m2.start()]
        state.append_token({"type": "text", "raw": raw_text})
        for token in new_state.tokens:
            state.append_token(token)
        return m2_pos

    def render(self, state: InlineState) -> List[Dict[str, Any]]:
        """Parse *state* and return its accumulated tokens."""
        self.parse(state)
        return state.tokens

    def __call__(self, s: str, env: MutableMapping[str, Any]) -> List[Dict[str, Any]]:
        """Tokenize the inline source *s* using the shared environment *env*."""
        state = self.state_cls(env)
        state.src = s
        return self.render(state)