yichael
/
image-match


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193
							import re
import string
from typing import Any, Dict, Tuple, Union

from .util import escape_url

# Regex prefix: asserts that no backslash precedes the current position, then
# consumes zero or more *pairs* of backslashes.  Placed in front of a
# character, it lets that character match only when it is not itself
# backslash-escaped.  Note that any consumed backslash pairs become part of
# the overall match text.
PREVENT_BACKSLASH = r"(?<!\\)(?:\\\\)*"
# Character class containing every character of ``string.punctuation``.
PUNCTUATION = r"[" + re.escape(string.punctuation) + r"]"

# Up to 500 units, each either a backslash followed by one character or a
# single character that is not a backslash or a square bracket.
LINK_LABEL = r"(?:[^\\\[\]]|\\.){0,500}"

# Optional spaces/tabs with at most one newline in between, then "<".
LINK_BRACKET_START = re.compile(r"[ \t]*\n?[ \t]*<")
# "<...>" whose content (group 1) has no "<", ">", newline, backslash or NUL.
LINK_BRACKET_RE = re.compile(r"<([^<>\n\\\x00]*)>")
# Optional leading whitespace (at most one newline), then a non-empty run of
# non-whitespace characters (group 1) followed by one whitespace character or
# the end of the string.
LINK_HREF_BLOCK_RE = re.compile(r"[ \t]*\n?[ \t]*([^\s]+)(?:\s|$)")
# Same leading whitespace, then a lazy, possibly empty run of characters other
# than space/tab/newline (group 1).  The match ends on -- and includes -- the
# first space/tab/newline or the first ")" that is not backslash-escaped.
LINK_HREF_INLINE_RE = re.compile(
    r"[ \t]*\n?[ \t]*([^ \t\n]*?)(?:[ \t\n]|"
    r"(?:" + PREVENT_BACKSLASH + r"\)))"
)

# At least one space/tab/newline, then a quoted string (group 1, quotes
# included).  Inside the quotes, a backslash followed by punctuation is taken
# as a unit; NUL is never allowed.
LINK_TITLE_RE = re.compile(
    r"[ \t\n]+("
    r'"(?:\\' + PUNCTUATION + r'|[^"\x00])*"|'  # "title"
    r"'(?:\\" + PUNCTUATION + r"|[^'\x00])*'"  # 'title'
    r")"
)
# Optional whitespace followed by a closing parenthesis.
PAREN_END_RE = re.compile(r"\s*\)")

# Pattern fragment (not compiled here): an ASCII letter followed by any number
# of ASCII letters, digits or hyphens.
HTML_TAGNAME = r"[A-Za-z][A-Za-z0-9-]*"
# Pattern fragment (not compiled here): zero or more attributes, each
# introduced by whitespace.  An attribute is a name optionally followed by
# "=" and an unquoted, single-quoted or double-quoted value.
HTML_ATTRIBUTES = (
    r"(?:\s+[A-Za-z_:][A-Za-z0-9_.:-]*"
    r'(?:\s*=\s*(?:[^ !"\'=<>`]+|\'[^\']*?\'|"[^\"]*?"))?)*'
)

# Lower-case HTML element names, in alphabetical order.
# NOTE(review): the name suggests these are the elements treated as
# block-level HTML; the consumer is outside this section -- confirm there.
BLOCK_TAGS = (
    "address",
    "article",
    "aside",
    "base",
    "basefont",
    "blockquote",
    "body",
    "caption",
    "center",
    "col",
    "colgroup",
    "dd",
    "details",
    "dialog",
    "dir",
    "div",
    "dl",
    "dt",
    "fieldset",
    "figcaption",
    "figure",
    "footer",
    "form",
    "frame",
    "frameset",
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "h6",
    "head",
    "header",
    "hr",
    "html",
    "iframe",
    "legend",
    "li",
    "link",
    "main",
    "menu",
    "menuitem",
    "meta",
    "nav",
    "noframes",
    "ol",
    "optgroup",
    "option",
    "p",
    "param",
    "section",
    "source",
    "summary",
    "table",
    "tbody",
    "td",
    "tfoot",
    "th",
    "thead",
    "title",
    "tr",
    "track",
    "ul",
)
# NOTE(review): presumably elements whose content is kept verbatim rather than
# parsed as Markdown; not used in this section -- verify against the caller.
PRE_TAGS = ("pre", "script", "style", "textarea")

# A link label (see LINK_LABEL) immediately followed by its closing "]".
_INLINE_LINK_LABEL_RE = re.compile(LINK_LABEL + r"\]")
# A "[" or "]" that is not backslash-escaped.  Because of PREVENT_BACKSLASH the
# matched text may start with an even-length run of backslashes.
_INLINE_SQUARE_BRACKET_RE = re.compile(PREVENT_BACKSLASH + r"[\[\]]")
# A backslash followed by one punctuation character (captured as group 1).
_ESCAPE_CHAR_RE = re.compile(r"\\(" + PUNCTUATION + r")")


def unescape_char(text: str) -> str:
    """Return ``text`` with backslash escapes of punctuation resolved.

    Every backslash that is directly followed by a punctuation character is
    removed and the punctuation character itself is kept.  Backslashes in
    front of any other character are left untouched.
    """
    unescaped = _ESCAPE_CHAR_RE.sub(r"\1", text)
    return unescaped


def parse_link_text(src: str, pos: int) -> Union[Tuple[str, int], Tuple[None, None]]:
    """Scan ``src`` from ``pos`` for the "]" that closes an already-open "[".

    Nested, unescaped square brackets are balanced: every "[" raises the
    nesting level and every "]" lowers it.  The scan starts at level 1, i.e.
    ``pos`` is expected to point just past the opening bracket.

    :param src: the text being parsed.
    :param pos: index at which the link text starts.
    :returns: ``(text, end)`` where ``text`` is the content between the
        brackets and ``end`` is the index just past the closing "]";
        ``(None, None)`` when no balancing "]" exists.
    """
    level = 1
    start_pos = pos

    while pos < len(src):
        m = _INLINE_SQUARE_BRACKET_RE.search(src, pos)
        if not m:
            break

        pos = m.end()
        # The pattern may consume an even-length run of backslashes in front
        # of the bracket (escaped backslashes), so the full match text can be
        # e.g. ``\\]``.  Only its last character is the bracket itself;
        # comparing the whole match would misread such a "]" as an opener.
        marker = m.group(0)[-1]
        if marker == "[":
            level += 1
            continue

        level -= 1
        if level == 0:
            # ``pos - 1`` is the index of the closing bracket.
            return src[start_pos : pos - 1], pos

    return None, None


def parse_link_label(src: str, start_pos: int) -> Union[Tuple[str, int], Tuple[None, None]]:
    """Read a link label that begins at ``start_pos`` and ends with "]".

    :param src: the text being parsed.
    :param start_pos: index of the first label character.
    :returns: ``(label, end)`` with the label text (closing bracket excluded)
        and the index just past the "]"; ``(None, None)`` when the text at
        ``start_pos`` is not a valid label.
    """
    matched = _INLINE_LINK_LABEL_RE.match(src, start_pos)
    if matched is None:
        return None, None

    end_pos = matched.end()
    # The match always ends with the closing bracket; leave it out.
    return src[matched.start() : end_pos - 1], end_pos


def parse_link_href(src: str, start_pos: int, block: bool = False) -> Union[Tuple[str, int], Tuple[None, None]]:
    """Parse a link destination starting at ``start_pos``.

    Two forms are recognised, after optional leading whitespace:

    * ``<...>`` -- the text between the angle brackets is the destination.
    * a bare destination, read with ``LINK_HREF_BLOCK_RE`` when ``block`` is
      true and with ``LINK_HREF_INLINE_RE`` otherwise.

    :param src: the text being parsed.
    :param start_pos: index at which to start matching.
    :param block: select the block pattern instead of the inline one.
    :returns: ``(href, end)``.  For the angle-bracket form ``end`` is just
        past the ">".  For a bare destination ``end`` is the index of the
        character that terminated it (whitespace or ")"), or the end of the
        match when the block pattern stopped at the end of the string.
        ``(None, None)`` when nothing matches.
    """
    bracket = LINK_BRACKET_START.match(src, start_pos)
    if bracket:
        # Step back onto the "<" that LINK_BRACKET_START consumed.
        m = LINK_BRACKET_RE.match(src, bracket.end() - 1)
        if m:
            return m.group(1), m.end()
        return None, None

    pattern = LINK_HREF_BLOCK_RE if block else LINK_HREF_INLINE_RE
    m = pattern.match(src, start_pos)
    if not m:
        return None, None

    end_pos = m.end()

    if block:
        href = m.group(1)
        if m.end(1) == end_pos:
            # Matched up to the end of the string: no terminator was consumed.
            return href, end_pos
        return href, end_pos - 1

    # The inline pattern always consumes exactly one terminator character, so
    # ``end_pos - 1`` is its index.  When that terminator is ")", the pattern
    # may also have swallowed an even-length run of backslashes right before
    # it; those are outside group 1 but still belong to the destination, so
    # slice the href from the source instead of using the group.
    return src[m.start(1) : end_pos - 1], end_pos - 1


def parse_link_title(src: str, start_pos: int, max_pos: int) -> Union[Tuple[str, int], Tuple[None, None]]:
    """Parse a quoted link title located between ``start_pos`` and ``max_pos``.

    The title must be preceded by whitespace and wrapped in single or double
    quotes.

    :param src: the text being parsed.
    :param start_pos: index at which to start matching.
    :param max_pos: index at which the match must stop at the latest.
    :returns: ``(title, end)`` with the quotes stripped and backslash escapes
        resolved, and the index just past the closing quote;
        ``(None, None)`` when no title is present.
    """
    matched = LINK_TITLE_RE.match(src, start_pos, max_pos)
    if matched is None:
        return None, None

    quoted = matched.group(1)
    # Drop the surrounding quote characters before unescaping.
    return unescape_char(quoted[1:-1]), matched.end()


def parse_link(src: str, pos: int) -> Union[Tuple[Dict[str, Any], int], Tuple[None, None]]:
    """Parse the ``destination "title")`` tail of an inline link.

    :param src: the text being parsed.
    :param pos: index at which the destination is expected to start.
    :returns: ``(attrs, end)`` where ``attrs`` holds ``"url"`` and, when a
        non-empty title was given, ``"title"``, and ``end`` is the index just
        past the closing ")"; ``(None, None)`` when the destination is
        missing or the closing parenthesis does not follow.
    """
    href, href_pos = parse_link_href(src, pos)
    if href is None:
        return None, None
    assert href_pos is not None

    title, title_pos = parse_link_title(src, href_pos, len(src))
    # Continue after the title when there is one, otherwise after the href.
    resume_at = title_pos if title_pos else href_pos
    closing = PAREN_END_RE.match(src, resume_at)
    if closing is None:
        return None, None

    attrs = {"url": escape_url(unescape_char(href))}
    if title:
        attrs["title"] = title
    return attrs, closing.end()