tokenization_bertweet.py 24 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698
  1. # Copyright (c) 2020, VinAI Research and the HuggingFace Inc. team.
  2. # Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. """Tokenization classes for BERTweet"""
  16. import html
  17. import os
  18. import re
  19. import regex
  20. from ...tokenization_python import PreTrainedTokenizer
  21. from ...utils import logging
# Module-level logger, created through the project's `logging` utility imported above.
logger = logging.get_logger(__name__)
  23. VOCAB_FILES_NAMES = {
  24. "vocab_file": "vocab.txt",
  25. "merges_file": "bpe.codes",
  26. }
  27. def get_pairs(word):
  28. """
  29. Return set of symbol pairs in a word.
  30. Word is represented as tuple of symbols (symbols being variable-length strings).
  31. """
  32. pairs = set()
  33. prev_char = word[0]
  34. for char in word[1:]:
  35. pairs.add((prev_char, char))
  36. prev_char = char
  37. pairs = set(pairs)
  38. return pairs
  39. class BertweetTokenizer(PreTrainedTokenizer):
  40. """
  41. Constructs a BERTweet tokenizer, using Byte-Pair-Encoding.
  42. This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
  43. this superclass for more information regarding those methods.
  44. Args:
  45. vocab_file (`str`):
  46. Path to the vocabulary file.
  47. merges_file (`str`):
  48. Path to the merges file.
  49. normalization (`bool`, *optional*, defaults to `False`):
  50. Whether or not to apply a normalization preprocess.
  51. bos_token (`str`, *optional*, defaults to `"<s>"`):
  52. The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
  53. <Tip>
  54. When building a sequence using special tokens, this is not the token that is used for the beginning of
  55. sequence. The token used is the `cls_token`.
  56. </Tip>
  57. eos_token (`str`, *optional*, defaults to `"</s>"`):
  58. The end of sequence token.
  59. <Tip>
  60. When building a sequence using special tokens, this is not the token that is used for the end of sequence.
  61. The token used is the `sep_token`.
  62. </Tip>
  63. sep_token (`str`, *optional*, defaults to `"</s>"`):
  64. The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
  65. sequence classification or for a text and a question for question answering. It is also used as the last
  66. token of a sequence built with special tokens.
  67. cls_token (`str`, *optional*, defaults to `"<s>"`):
  68. The classifier token which is used when doing sequence classification (classification of the whole sequence
  69. instead of per-token classification). It is the first token of the sequence when built with special tokens.
  70. unk_token (`str`, *optional*, defaults to `"<unk>"`):
  71. The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
  72. token instead.
  73. pad_token (`str`, *optional*, defaults to `"<pad>"`):
  74. The token used for padding, for example when batching sequences of different lengths.
  75. mask_token (`str`, *optional*, defaults to `"<mask>"`):
  76. The token used for masking values. This is the token used when training this model with masked language
  77. modeling. This is the token which the model will try to predict.
  78. """
  79. vocab_files_names = VOCAB_FILES_NAMES
  80. def __init__(
  81. self,
  82. vocab_file,
  83. merges_file,
  84. normalization=False,
  85. bos_token="<s>",
  86. eos_token="</s>",
  87. sep_token="</s>",
  88. cls_token="<s>",
  89. unk_token="<unk>",
  90. pad_token="<pad>",
  91. mask_token="<mask>",
  92. **kwargs,
  93. ):
  94. try:
  95. from emoji import demojize
  96. self.demojizer = demojize
  97. except ImportError:
  98. logger.warning(
  99. "emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3"
  100. " install emoji==0.6.0"
  101. )
  102. self.demojizer = None
  103. self.vocab_file = vocab_file
  104. self.merges_file = merges_file
  105. self.encoder = {}
  106. self.encoder[str(bos_token)] = 0
  107. self.encoder[str(pad_token)] = 1
  108. self.encoder[str(eos_token)] = 2
  109. self.encoder[str(unk_token)] = 3
  110. self.add_from_file(vocab_file)
  111. self.decoder = {v: k for k, v in self.encoder.items()}
  112. with open(merges_file, encoding="utf-8") as merges_handle:
  113. merges = merges_handle.read().split("\n")[:-1]
  114. merges = [tuple(merge.split()[:-1]) for merge in merges]
  115. self.bpe_ranks = dict(zip(merges, range(len(merges))))
  116. self.cache = {}
  117. self.normalization = normalization
  118. self.tweetPreprocessor = TweetTokenizer()
  119. self.special_puncts = {"’": "'", "…": "..."}
  120. super().__init__(
  121. normalization=normalization,
  122. bos_token=bos_token,
  123. eos_token=eos_token,
  124. sep_token=sep_token,
  125. cls_token=cls_token,
  126. unk_token=unk_token,
  127. pad_token=pad_token,
  128. mask_token=mask_token,
  129. # Configure patterns instead of overriding methods
  130. token_type_ids_pattern="all_zeros", # BERTweet doesn't use token type IDs
  131. token_type_ids_include_special_tokens=True,
  132. special_tokens_pattern="cls_double_sep", # <s> X </s></s> Y </s>
  133. **kwargs,
  134. )
  135. @property
  136. def vocab_size(self):
  137. return len(self.encoder)
  138. def get_vocab(self):
  139. return dict(self.encoder, **self.added_tokens_encoder)
  140. def bpe(self, token):
  141. if token in self.cache:
  142. return self.cache[token]
  143. word = tuple(token)
  144. word = tuple(list(word[:-1]) + [word[-1] + "</w>"])
  145. pairs = get_pairs(word)
  146. if not pairs:
  147. return token
  148. while True:
  149. bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
  150. if bigram not in self.bpe_ranks:
  151. break
  152. first, second = bigram
  153. new_word = []
  154. i = 0
  155. while i < len(word):
  156. try:
  157. j = word.index(first, i)
  158. except ValueError:
  159. new_word.extend(word[i:])
  160. break
  161. else:
  162. new_word.extend(word[i:j])
  163. i = j
  164. if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
  165. new_word.append(first + second)
  166. i += 2
  167. else:
  168. new_word.append(word[i])
  169. i += 1
  170. new_word = tuple(new_word)
  171. word = new_word
  172. if len(word) == 1:
  173. break
  174. else:
  175. pairs = get_pairs(word)
  176. word = "@@ ".join(word)
  177. word = word[:-4]
  178. self.cache[token] = word
  179. return word
  180. def _tokenize(self, text):
  181. """Tokenize a string."""
  182. if self.normalization: # Perform Tweet normalization before performing BPE
  183. text = self.normalizeTweet(text)
  184. split_tokens = []
  185. words = re.findall(r"\S+\n?", text)
  186. for token in words:
  187. split_tokens.extend(list(self.bpe(token).split(" ")))
  188. return split_tokens
  189. def normalizeTweet(self, tweet):
  190. """
  191. Normalize a raw Tweet
  192. """
  193. for punct in self.special_puncts:
  194. tweet = tweet.replace(punct, self.special_puncts[punct])
  195. tokens = self.tweetPreprocessor.tokenize(tweet)
  196. normTweet = " ".join([self.normalizeToken(token) for token in tokens])
  197. normTweet = (
  198. normTweet.replace("cannot ", "can not ")
  199. .replace("n't ", " n't ")
  200. .replace("n 't ", " n't ")
  201. .replace("ca n't", "can't")
  202. .replace("ai n't", "ain't")
  203. )
  204. normTweet = (
  205. normTweet.replace("'m ", " 'm ")
  206. .replace("'re ", " 're ")
  207. .replace("'s ", " 's ")
  208. .replace("'ll ", " 'll ")
  209. .replace("'d ", " 'd ")
  210. .replace("'ve ", " 've ")
  211. )
  212. normTweet = (
  213. normTweet.replace(" p . m .", " p.m.")
  214. .replace(" p . m ", " p.m ")
  215. .replace(" a . m .", " a.m.")
  216. .replace(" a . m ", " a.m ")
  217. )
  218. return " ".join(normTweet.split())
  219. def normalizeToken(self, token):
  220. """
  221. Normalize tokens in a Tweet
  222. """
  223. lowercased_token = token.lower()
  224. if token.startswith("@"):
  225. return "@USER"
  226. elif lowercased_token.startswith("http") or lowercased_token.startswith("www"):
  227. return "HTTPURL"
  228. elif len(token) == 1:
  229. if token in self.special_puncts:
  230. return self.special_puncts[token]
  231. if self.demojizer is not None:
  232. return self.demojizer(token)
  233. else:
  234. return token
  235. else:
  236. return token
  237. def _convert_token_to_id(self, token):
  238. """Converts a token (str) in an id using the vocab."""
  239. return self.encoder.get(token, self.encoder.get(self.unk_token))
  240. def _convert_id_to_token(self, index):
  241. """Converts an index (integer) in a token (str) using the vocab."""
  242. return self.decoder.get(index, self.unk_token)
  243. def convert_tokens_to_string(self, tokens):
  244. """Converts a sequence of tokens (string) in a single string."""
  245. out_string = " ".join(tokens).replace("@@ ", "").strip()
  246. return out_string
  247. # def decode(self, token_ids, skip_special_tokens=False, clean_up_tokenization_spaces=True):
  248. # filtered_tokens = ' '.join(self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens))
  249. # tokens_generated_so_far = re.sub('(@@ )', '', string=filtered_tokens)
  250. # tokens_generated_so_far = re.sub('(@@ ?$)', '', string=tokens_generated_so_far)
  251. # return ''.join(tokens_generated_so_far)
  252. def save_vocabulary(self, save_directory: str, filename_prefix: str | None = None) -> tuple[str, ...]:
  253. """
  254. Save the vocabulary and merges files to a directory.
  255. """
  256. if not os.path.isdir(save_directory):
  257. logger.error(f"Vocabulary path ({save_directory}) should be a directory")
  258. return ()
  259. vocab_files_names = getattr(self, "vocab_files_names", {})
  260. prefix = f"{filename_prefix}-" if filename_prefix else ""
  261. # Save vocabulary in the format expected by add_from_file: <token> <id>
  262. # Exclude special tokens (IDs 0-3) as they are added in __init__ before add_from_file
  263. vocab_file = os.path.join(save_directory, prefix + vocab_files_names.get("vocab_file", "vocab.txt"))
  264. with open(vocab_file, "w", encoding="utf-8") as f:
  265. for token, token_id in sorted(self.encoder.items(), key=lambda kv: kv[1]):
  266. # Only save tokens with ID >= 4, as IDs 0-3 are reserved for special tokens
  267. if token_id >= 4:
  268. f.write(f"{token} {token_id}\n")
  269. # Save BPE merges
  270. merge_file = os.path.join(save_directory, prefix + vocab_files_names.get("merges_file", "bpe.codes"))
  271. with open(merge_file, "w", encoding="utf-8") as writer:
  272. writer.writelines(
  273. " ".join(bpe_tokens) + "\n"
  274. for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1])
  275. )
  276. return (vocab_file, merge_file)
  277. def add_from_file(self, f):
  278. """
  279. Loads a pre-existing dictionary from a text file and adds its symbols to this instance.
  280. """
  281. if isinstance(f, str):
  282. try:
  283. with open(f, "r", encoding="utf-8") as fd:
  284. self.add_from_file(fd)
  285. except FileNotFoundError as fnfe:
  286. raise fnfe
  287. except UnicodeError:
  288. raise Exception(f"Incorrect encoding detected in {f}, please rebuild the dataset")
  289. return
  290. lines = f.readlines()
  291. for lineTmp in lines:
  292. line = lineTmp.strip()
  293. idx = line.rfind(" ")
  294. if idx == -1:
  295. raise ValueError("Incorrect dictionary format, expected '<token> <cnt>'")
  296. word = line[:idx]
  297. self.encoder[word] = len(self.encoder)
  298. # Natural Language Toolkit: Twitter Tokenizer
  299. #
  300. # Copyright (C) 2001-2020 NLTK Project
  301. # Author: Christopher Potts <cgpotts@stanford.edu>
  302. # Ewan Klein <ewan@inf.ed.ac.uk> (modifications)
  303. # Pierpaolo Pantone <> (modifications)
  304. # URL: http://nltk.org/
  305. # For license information, see LICENSE.TXT
  306. #
"""
Twitter-aware tokenizer, designed to be flexible and easy to adapt to new domains and tasks. The basic logic is this:
1. The tuple REGEXPS defines a list of regular expression strings.
2. The REGEXPS strings are put, in order, into a compiled regular expression object called WORD_RE.
3. The tokenization is done by WORD_RE.findall(s), where s is the user-supplied string, inside the tokenize() method of
   the class TweetTokenizer.
4. When instantiating TweetTokenizer objects, there are three options: preserve_case, reduce_len and strip_handles. By
   default, preserve_case is set to True. If it is set to False, then the tokenizer will lowercase everything except
   for emoticons.
"""
  316. ######################################################################
  317. #
  318. # import regex # https://github.com/nltk/nltk/issues/2409
  319. # import html
  320. #
  321. ######################################################################
  322. # The following strings are components in the regular expression
  323. # that is used for tokenizing. It's important that phone_number
  324. # appears first in the final regex (since it can contain whitespace).
  325. # It also could matter that tags comes after emoticons, due to the
  326. # possibility of having text like
  327. #
  328. # <:| and some text >:)
  329. #
  330. # Most importantly, the final element should always be last, since it
  331. # does a last ditch whitespace-based tokenization of whatever is left.
  332. # ToDo: Update with http://en.wikipedia.org/wiki/List_of_emoticons ?
# This particular element is used in a couple ways, so we define it
# with a name: it is one of the alternatives listed in REGEXPS and it is
# also compiled on its own as EMOTICON_RE. The pattern is written for
# VERBOSE mode, so the whitespace and `#` comments inside it are ignored.
# docstyle-ignore
EMOTICONS = r"""
    (?:
      [<>]?
      [:;=8]                     # eyes
      [\-o\*\']?                 # optional nose
      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
      |
      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
      [\-o\*\']?                 # optional nose
      [:;=8]                     # eyes
      [<>]?
      |
      <3                         # heart
    )"""
# URL pattern due to John Gruber, modified by Tom Winzig. See
# https://gist.github.com/winzig/8894715
# Two top-level alternatives: a URL that starts with a protocol or with a
# "domain/" prefix, or a naked domain that is not adjacent to an "@".
# The pattern is compiled in VERBOSE mode as part of WORD_RE.
# docstyle-ignore
URLS = r"""                         # Capture 1: entire matched URL
  (?:
  https?:                           # URL protocol and colon
    (?:
      /{1,3}                        # 1-3 slashes
      |                             #   or
      [a-z0-9%]                     # Single letter or digit or '%'
                                    # (Trying not to match e.g. "URI::Escape")
    )
    |                               #   or
                                    # looks like domain name followed by a slash:
    [a-z0-9.\-]+[.]
    (?:[a-z]{2,13})
    /
  )
  (?:                               # One or more:
    [^\s()<>{}\[\]]+                # Run of non-space, non-()<>{}[]
    |                               #   or
    \([^\s()]*?\([^\s()]+\)[^\s()]*?\) # balanced parens, one level deep: (...(...)...)
    |
    \([^\s]+?\)                     # balanced parens, non-recursive: (...)
  )+
  (?:                               # End with:
    \([^\s()]*?\([^\s()]+\)[^\s()]*?\) # balanced parens, one level deep: (...(...)...)
    |
    \([^\s]+?\)                     # balanced parens, non-recursive: (...)
    |                               #   or
    [^\s`!()\[\]{};:'".,<>?«»“”‘’]  # not a space or one of these punct chars
  )
  |                                 # OR, the following to match naked domains:
  (?:
    (?<!@)                          # not preceded by a @, avoid matching foo@_gmail.com_
    [a-z0-9]+
    (?:[.\-][a-z0-9]+)*
    [.]
    (?:[a-z]{2,13})
    \b
    /?
    (?!@)                           # not succeeded by a @,
                                    # avoid matching "foo.na" in "foo.na@example.com"
  )
"""
# docstyle-ignore
# The components of the tokenizer, in matching priority order. They are
# joined with "|" into WORD_RE, so an earlier entry wins over a later one
# at the same position. Every entry is written for VERBOSE mode.
REGEXPS = (
    URLS,
    # Phone numbers:
    r"""
    (?:
      (?:            # (international)
        \+?[01]
        [ *\-.\)]*
      )?
      (?:            # (area code)
        [\(]?
        \d{3}
        [ *\-.\)]*
      )?
      \d{3}          # exchange
      [ *\-.\)]*
      \d{4}          # base
    )""",
    # ASCII Emoticons
    EMOTICONS,
    # HTML tags:
    r"""<[^>\s]+>""",
    # ASCII Arrows
    r"""[\-]+>|<[\-]+""",
    # Twitter username:
    r"""(?:@[\w_]+)""",
    # Twitter hashtags:
    r"""(?:\#+[\w_]+[\w\'_\-]*[\w_]+)""",
    # email addresses
    r"""[\w.+-]+@[\w-]+\.(?:[\w-]\.?)+[\w-]""",
    # docstyle-ignore
    # Remaining word types (the final catch-all alternative must stay last):
    r"""
    (?:[^\W\d_](?:[^\W\d_]|['\-_])+[^\W\d_]) # Words with apostrophes or dashes.
    |
    (?:[+\-]?\d+[,/.:-]\d+[+\-]?)  # Numbers, including fractions, decimals.
    |
    (?:[\w_]+)                     # Words without apostrophes or dashes.
    |
    (?:\.(?:\s*\.){1,})            # Ellipsis dots.
    |
    (?:\S)                         # Everything else that isn't whitespace.
    """,
)
  441. ######################################################################
# This is the core tokenizing regex: every REGEXPS entry joined with "|"
# inside a single capturing group, compiled case-insensitively in VERBOSE mode.
WORD_RE = regex.compile(r"""(%s)""" % "|".join(REGEXPS), regex.VERBOSE | regex.I | regex.UNICODE)
# WORD_RE performs poorly on these patterns: a character other than an ASCII
# letter or digit repeated four or more times. TweetTokenizer.tokenize shortens
# such runs to three characters before applying WORD_RE.
HANG_RE = regex.compile(r"([^a-zA-Z0-9])\1{3,}")
# The emoticon string gets its own regex so that we can preserve case for
# them as needed:
EMOTICON_RE = regex.compile(EMOTICONS, regex.VERBOSE | regex.I | regex.UNICODE)
# These are for regularizing HTML entities to Unicode. Group 1 is the optional
# "#" / "#x" numeric marker, group 2 the optional "x" (hexadecimal) flag and
# group 3 the entity body.
ENT_RE = regex.compile(r"&(#?(x?))([^&;\s]+);")
  451. ######################################################################
  452. # Functions for converting html entities
  453. ######################################################################
  454. def _str_to_unicode(text, encoding=None, errors="strict"):
  455. if encoding is None:
  456. encoding = "utf-8"
  457. if isinstance(text, bytes):
  458. return text.decode(encoding, errors)
  459. return text
  460. def _replace_html_entities(text, keep=(), remove_illegal=True, encoding="utf-8"):
  461. """
  462. Remove entities from text by converting them to their corresponding unicode character.
  463. Args:
  464. text:
  465. A unicode string or a byte string encoded in the given *encoding* (which defaults to 'utf-8').
  466. keep (list):
  467. List of entity names which should not be replaced. This supports both numeric entities (`&#nnnn;` and
  468. `&#hhhh;`) and named entities (such as `&nbsp;` or `&gt;`).
  469. remove_illegal (bool):
  470. If `True`, entities that can't be converted are removed. Otherwise, entities that can't be converted are
  471. kept "as is".
  472. Returns: A unicode string with the entities removed.
  473. See https://github.com/scrapy/w3lib/blob/master/w3lib/html.py
  474. Examples:
  475. ```python
  476. >>> from nltk.tokenize.casual import _replace_html_entities
  477. >>> _replace_html_entities(b"Price: &pound;100")
  478. 'Price: \\xa3100'
  479. >>> print(_replace_html_entities(b"Price: &pound;100"))
  480. Price: £100
  481. ```"""
  482. def _convert_entity(match):
  483. entity_body = match.group(3)
  484. if match.group(1):
  485. try:
  486. if match.group(2):
  487. number = int(entity_body, 16)
  488. else:
  489. number = int(entity_body, 10)
  490. # Numeric character references in the 80-9F range are typically
  491. # interpreted by browsers as representing the characters mapped
  492. # to bytes 80-9F in the Windows-1252 encoding. For more info
  493. # see: https://en.wikipedia.org/wiki/ISO/IEC_8859-1#Similar_character_sets
  494. if 0x80 <= number <= 0x9F:
  495. return bytes((number,)).decode("cp1252")
  496. except ValueError:
  497. number = None
  498. else:
  499. if entity_body in keep:
  500. return match.group(0)
  501. else:
  502. number = html.entities.name2codepoint.get(entity_body)
  503. if number is not None:
  504. try:
  505. return chr(number)
  506. except (ValueError, OverflowError):
  507. pass
  508. return "" if remove_illegal else match.group(0)
  509. return ENT_RE.sub(_convert_entity, _str_to_unicode(text, encoding))
  510. ######################################################################
  511. class TweetTokenizer:
  512. r"""
  513. Examples:
  514. ```python
  515. >>> # Tokenizer for tweets.
  516. >>> from nltk.tokenize import TweetTokenizer
  517. >>> tknzr = TweetTokenizer()
  518. >>> s0 = "This is a cooool #dummysmiley: :-) :-P <3 and some arrows < > -> <--"
  519. >>> tknzr.tokenize(s0)
  520. ['This', 'is', 'a', 'cooool', '#dummysmiley', ':', ':-)', ':-P', '<3', 'and', 'some', 'arrows', '<', '>', '->', '<--']
  521. >>> # Examples using *strip_handles* and *reduce_len parameters*:
  522. >>> tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)
  523. >>> s1 = "@remy: This is waaaaayyyy too much for you!!!!!!"
  524. >>> tknzr.tokenize(s1)
  525. [':', 'This', 'is', 'waaayyy', 'too', 'much', 'for', 'you', '!', '!', '!']
  526. ```"""
  527. def __init__(self, preserve_case=True, reduce_len=False, strip_handles=False):
  528. self.preserve_case = preserve_case
  529. self.reduce_len = reduce_len
  530. self.strip_handles = strip_handles
  531. def tokenize(self, text):
  532. """
  533. Args:
  534. text: str
  535. Returns: list(str) A tokenized list of strings; concatenating this list returns the original string if
  536. `preserve_case=False`
  537. """
  538. # Fix HTML character entities:
  539. text = _replace_html_entities(text)
  540. # Remove username handles
  541. if self.strip_handles:
  542. text = remove_handles(text)
  543. # Normalize word lengthening
  544. if self.reduce_len:
  545. text = reduce_lengthening(text)
  546. # Shorten problematic sequences of characters
  547. safe_text = HANG_RE.sub(r"\1\1\1", text)
  548. # Tokenize:
  549. words = WORD_RE.findall(safe_text)
  550. # Possibly alter the case, but avoid changing emoticons like :D into :d:
  551. if not self.preserve_case:
  552. words = [x if EMOTICON_RE.search(x) else x.lower() for x in words]
  553. return words
  554. ######################################################################
  555. # Normalization Functions
  556. ######################################################################
  557. def reduce_lengthening(text):
  558. """
  559. Replace repeated character sequences of length 3 or greater with sequences of length 3.
  560. """
  561. pattern = regex.compile(r"(.)\1{2,}")
  562. return pattern.sub(r"\1\1\1", text)
  563. def remove_handles(text):
  564. """
  565. Remove Twitter username handles from text.
  566. """
  567. pattern = regex.compile(
  568. r"(?<![A-Za-z0-9_!@#\$%&*])@(([A-Za-z0-9_]){20}(?!@))|(?<![A-Za-z0-9_!@#\$%&*])@(([A-Za-z0-9_]){1,19})(?![A-Za-z0-9_]*@)"
  569. )
  570. # Substitute handles with ' ' to ensure that text on either side of removed handles are tokenized correctly
  571. return pattern.sub(" ", text)
  572. ######################################################################
  573. # Tokenization Function
  574. ######################################################################
  575. def casual_tokenize(text, preserve_case=True, reduce_len=False, strip_handles=False):
  576. """
  577. Convenience function for wrapping the tokenizer.
  578. """
  579. return TweetTokenizer(preserve_case=preserve_case, reduce_len=reduce_len, strip_handles=strip_handles).tokenize(
  580. text
  581. )
  582. ###############################################################################
# Names exported by `from <module> import *`: only the tokenizer class.
__all__ = ["BertweetTokenizer"]