rfc3986_validator.py 4.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106
  1. import re
  2. __version__ = '0.1.1'
  3. __author__ = 'Nicolas Aimetti <naimetti@onapsis.com>'
  4. __all__ = ['validate_rfc3986']
  5. # Following regex rules references the ABNF terminology from
  6. # [RFC3986](https://tools.ietf.org/html/rfc3986#appendix-A)
  7. # IPv6 validation rule
  8. IPv6_RE = (
  9. r"(?:(?:[0-9A-Fa-f]{1,4}:){6}(?:[0-9A-Fa-f]{1,4}:[0-9A-Fa-f]{1,4}|(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9]["
  10. r"0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?))|::(?:[0-9A-Fa-f]{1,4}:){5}(?:[0-9A-Fa-f]{1,"
  11. r"4}:[0-9A-Fa-f]{1,4}|(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9]["
  12. r"0-9]?))|(?:[0-9A-Fa-f]{1,4})?::(?:[0-9A-Fa-f]{1,4}:){4}(?:[0-9A-Fa-f]{1,4}:[0-9A-Fa-f]{1,4}|(?:(?:25[0-5]|2["
  13. r"0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?))|(?:(?:[0-9A-Fa-f]{1,"
  14. r"4}:)?[0-9A-Fa-f]{1,4})?::(?:[0-9A-Fa-f]{1,4}:){3}(?:[0-9A-Fa-f]{1,4}:[0-9A-Fa-f]{1,4}|(?:(?:25[0-5]|2[0-4]["
  15. r"0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?))|(?:(?:[0-9A-Fa-f]{1,4}:){,"
  16. r"2}[0-9A-Fa-f]{1,4})?::(?:[0-9A-Fa-f]{1,4}:){2}(?:[0-9A-Fa-f]{1,4}:[0-9A-Fa-f]{1,4}|(?:(?:25[0-5]|2[0-4]["
  17. r"0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?))|(?:(?:[0-9A-Fa-f]{1,4}:){,"
  18. r"3}[0-9A-Fa-f]{1,4})?::(?:[0-9A-Fa-f]{1,4}:)(?:[0-9A-Fa-f]{1,4}:[0-9A-Fa-f]{1,4}|(?:(?:25[0-5]|2[0-4][0-9]|["
  19. r"01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?))|(?:(?:[0-9A-Fa-f]{1,4}:){,4}[0-9A-Fa-f]{1,"
  20. r"4})?::(?:[0-9A-Fa-f]{1,4}:[0-9A-Fa-f]{1,4}|(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2["
  21. r"0-4][0-9]|[01]?[0-9][0-9]?))|(?:(?:[0-9A-Fa-f]{1,4}:){,5}[0-9A-Fa-f]{1,4})?::[0-9A-Fa-f]{1,4}|(?:(?:["
  22. r"0-9A-Fa-f]{1,4}:){,6}[0-9A-Fa-f]{1,4})?::)"
  23. )
  24. # An authority is defined as: [ userinfo "@" ] host [ ":" port ]
  25. # \[(?:{ip_v6} | v[0-9A-Fa-f]+\.[a-zA-Z0-9_.~\-!$ & '()*+,;=:]+)\] # IP-literal
  26. AUTHORITY_RE = r"""
  27. (?:(?:[a-zA-Z0-9_.~\-!$&'()*+,;=:]|%[0-9A-Fa-f]{{2}})*@)? # user info
  28. (?:
  29. \[(?:{ip_v6}|v[0-9A-Fa-f]+\.[a-zA-Z0-9_.~\-!$&'()*+,;=:]+)\] # IP-literal
  30. | (?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){{3}}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?) # IPv4
  31. | (?:[a-zA-Z0-9_.~\-!$&'()*+,;=]|%[0-9A-Fa-f]{{2}})* # reg-name
  32. ) # host
  33. (?::[0-9]*)? # port
  34. """.format(ip_v6=IPv6_RE,)
  35. # Path char regex rule
  36. PCHAR_RE = r"(?:[a-zA-Z0-9_.~\-!$&'()*+,;=:@]|%[0-9A-Fa-f]{2})"
  37. # Query and Fragment rules are exactly the same
  38. QUERY_RE = r"(?:[a-zA-Z0-9_.~\-!$&'()*+,;=:@/?]|%[0-9A-Fa-f]{2})*"
  39. # An URI is defined as: scheme ":" hier-part [ "?" query ] [ "#" fragment ]
  40. URI_RE = r"""
  41. [a-zA-Z][a-zA-Z0-9+.-]* #scheme
  42. :
  43. (?:
  44. //
  45. {authority}
  46. (?:/{pchar}*)* # path-abempty
  47. | /(?:{pchar}+ (?:/{pchar}*)*)? # path-absolute
  48. | {pchar}+ (?:/{pchar}*)* # path-rootless
  49. | # or nothing
  50. ) # hier-part
  51. (?:\?{query})? # Query
  52. (?:\#{fragment})? # Fragment
  53. """.format(
  54. authority=AUTHORITY_RE,
  55. query=QUERY_RE,
  56. fragment=QUERY_RE,
  57. pchar=PCHAR_RE
  58. )
  59. # A relative-ref is defined as: relative-part [ "?" query ] [ "#" fragment ]
  60. RELATIVE_REF_RE = r"""
  61. (?:
  62. //
  63. {authority}
  64. (?:/{pchar}*)* # path-abempty
  65. | /(?:{pchar}+ (?:/{pchar}*)*)? # path-absolute
  66. | (?:[a-zA-Z0-9_.~\-!$&'()*+,;=@]|%[0-9A-Fa-f]{{2}})+ (?:/{pchar}*)* # path-noscheme
  67. | # or nothing
  68. ) # relative-part
  69. (?:\?{query})? # Query
  70. (?:\#{fragment})? # Fragment
  71. """.format(
  72. authority=AUTHORITY_RE,
  73. query=QUERY_RE,
  74. fragment=QUERY_RE,
  75. pchar=PCHAR_RE
  76. )
  77. # Compiled URI regex rule
  78. URI_RE_COMP = re.compile(r"^{uri_re}$".format(uri_re=URI_RE), re.VERBOSE)
  79. # Compiled URI-reference regex rule. URI-reference is defined as: URI / relative-ref
  80. URI_REF_RE_COMP = re.compile(r"^(?:{uri_re}|{relative_ref})$".format(
  81. uri_re=URI_RE,
  82. relative_ref=RELATIVE_REF_RE,
  83. ), re.VERBOSE)
  84. def validate_rfc3986(url, rule='URI'):
  85. """
  86. Validates strings according to RFC3986
  87. :param url: String cointaining URI to validate
  88. :param rule: It could be 'URI' (default) or 'URI_reference'.
  89. :return: True or False
  90. """
  91. if rule == 'URI':
  92. return URI_RE_COMP.match(url)
  93. elif rule == 'URI_reference':
  94. return URI_REF_RE_COMP.match(url)
  95. else:
  96. raise ValueError('Invalid rule')