No puede seleccionar más de 25 temas Los temas deben comenzar con una letra o número, pueden incluir guiones ('-') y pueden tener hasta 35 caracteres de largo.

268 líneas
8.9KB

# -*- coding: utf-8 -*-
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Module for the regular expressions crafted from ABNF."""
import sys
  16. # https://tools.ietf.org/html/rfc3986#page-13
  17. GEN_DELIMS = GENERIC_DELIMITERS = ":/?#[]@"
  18. GENERIC_DELIMITERS_SET = set(GENERIC_DELIMITERS)
  19. # https://tools.ietf.org/html/rfc3986#page-13
  20. SUB_DELIMS = SUB_DELIMITERS = "!$&'()*+,;="
  21. SUB_DELIMITERS_SET = set(SUB_DELIMITERS)
  22. # Escape the '*' for use in regular expressions
  23. SUB_DELIMITERS_RE = r"!$&'()\*+,;="
  24. RESERVED_CHARS_SET = GENERIC_DELIMITERS_SET.union(SUB_DELIMITERS_SET)
  25. ALPHA = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
  26. DIGIT = '0123456789'
  27. # https://tools.ietf.org/html/rfc3986#section-2.3
  28. UNRESERVED = UNRESERVED_CHARS = ALPHA + DIGIT + r'._!-'
  29. UNRESERVED_CHARS_SET = set(UNRESERVED_CHARS)
  30. NON_PCT_ENCODED_SET = RESERVED_CHARS_SET.union(UNRESERVED_CHARS_SET)
  31. # We need to escape the '-' in this case:
  32. UNRESERVED_RE = r'A-Za-z0-9._~\-'
  33. # Percent encoded character values
  34. PERCENT_ENCODED = PCT_ENCODED = '%[A-Fa-f0-9]{2}'
  35. PCHAR = '([' + UNRESERVED_RE + SUB_DELIMITERS_RE + ':@]|%s)' % PCT_ENCODED
  36. # NOTE(sigmavirus24): We're going to use more strict regular expressions
  37. # than appear in Appendix B for scheme. This will prevent over-eager
  38. # consuming of items that aren't schemes.
  39. SCHEME_RE = '[a-zA-Z][a-zA-Z0-9+.-]*'
  40. _AUTHORITY_RE = '[^/?#]*'
  41. _PATH_RE = '[^?#]*'
  42. _QUERY_RE = '[^#]*'
  43. _FRAGMENT_RE = '.*'
  44. # Extracted from http://tools.ietf.org/html/rfc3986#appendix-B
  45. COMPONENT_PATTERN_DICT = {
  46. 'scheme': SCHEME_RE,
  47. 'authority': _AUTHORITY_RE,
  48. 'path': _PATH_RE,
  49. 'query': _QUERY_RE,
  50. 'fragment': _FRAGMENT_RE,
  51. }
  52. # See http://tools.ietf.org/html/rfc3986#appendix-B
  53. # In this case, we name each of the important matches so we can use
  54. # SRE_Match#groupdict to parse the values out if we so choose. This is also
  55. # modified to ignore other matches that are not important to the parsing of
  56. # the reference so we can also simply use SRE_Match#groups.
  57. URL_PARSING_RE = (
  58. r'(?:(?P<scheme>{scheme}):)?(?://(?P<authority>{authority}))?'
  59. r'(?P<path>{path})(?:\?(?P<query>{query}))?'
  60. r'(?:#(?P<fragment>{fragment}))?'
  61. ).format(**COMPONENT_PATTERN_DICT)
  62. # #########################
  63. # Authority Matcher Section
  64. # #########################
  65. # Host patterns, see: http://tools.ietf.org/html/rfc3986#section-3.2.2
  66. # The pattern for a regular name, e.g., www.google.com, api.github.com
  67. REGULAR_NAME_RE = REG_NAME = '((?:{0}|[{1}])*)'.format(
  68. '%[0-9A-Fa-f]{2}', SUB_DELIMITERS_RE + UNRESERVED_RE
  69. )
  70. # The pattern for an IPv4 address, e.g., 192.168.255.255, 127.0.0.1,
  71. IPv4_RE = r'([0-9]{1,3}\.){3}[0-9]{1,3}'
  72. # Hexadecimal characters used in each piece of an IPv6 address
  73. HEXDIG_RE = '[0-9A-Fa-f]{1,4}'
  74. # Least-significant 32 bits of an IPv6 address
  75. LS32_RE = '({hex}:{hex}|{ipv4})'.format(hex=HEXDIG_RE, ipv4=IPv4_RE)
  76. # Substitutions into the following patterns for IPv6 patterns defined
  77. # http://tools.ietf.org/html/rfc3986#page-20
  78. _subs = {'hex': HEXDIG_RE, 'ls32': LS32_RE}
  79. # Below: h16 = hexdig, see: https://tools.ietf.org/html/rfc5234 for details
  80. # about ABNF (Augmented Backus-Naur Form) use in the comments
  81. variations = [
  82. # 6( h16 ":" ) ls32
  83. '(%(hex)s:){6}%(ls32)s' % _subs,
  84. # "::" 5( h16 ":" ) ls32
  85. '::(%(hex)s:){5}%(ls32)s' % _subs,
  86. # [ h16 ] "::" 4( h16 ":" ) ls32
  87. '(%(hex)s)?::(%(hex)s:){4}%(ls32)s' % _subs,
  88. # [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32
  89. '((%(hex)s:)?%(hex)s)?::(%(hex)s:){3}%(ls32)s' % _subs,
  90. # [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32
  91. '((%(hex)s:){0,2}%(hex)s)?::(%(hex)s:){2}%(ls32)s' % _subs,
  92. # [ *3( h16 ":" ) h16 ] "::" h16 ":" ls32
  93. '((%(hex)s:){0,3}%(hex)s)?::%(hex)s:%(ls32)s' % _subs,
  94. # [ *4( h16 ":" ) h16 ] "::" ls32
  95. '((%(hex)s:){0,4}%(hex)s)?::%(ls32)s' % _subs,
  96. # [ *5( h16 ":" ) h16 ] "::" h16
  97. '((%(hex)s:){0,5}%(hex)s)?::%(hex)s' % _subs,
  98. # [ *6( h16 ":" ) h16 ] "::"
  99. '((%(hex)s:){0,6}%(hex)s)?::' % _subs,
  100. ]
  101. IPv6_RE = '(({0})|({1})|({2})|({3})|({4})|({5})|({6})|({7})|({8}))'.format(
  102. *variations
  103. )
  104. IPv_FUTURE_RE = r'v[0-9A-Fa-f]+\.[%s]+' % (
  105. UNRESERVED_RE + SUB_DELIMITERS_RE + ':'
  106. )
  107. # RFC 6874 Zone ID ABNF
  108. ZONE_ID = '(?:[' + UNRESERVED_RE + ']|' + PCT_ENCODED + ')+'
  109. IPv6_ADDRZ_RFC4007_RE = IPv6_RE + '(?:(?:%25|%)' + ZONE_ID + ')?'
  110. IPv6_ADDRZ_RE = IPv6_RE + '(?:%25' + ZONE_ID + ')?'
  111. IP_LITERAL_RE = r'\[({0}|{1})\]'.format(
  112. IPv6_ADDRZ_RFC4007_RE,
  113. IPv_FUTURE_RE,
  114. )
  115. # Pattern for matching the host piece of the authority
  116. HOST_RE = HOST_PATTERN = '({0}|{1}|{2})'.format(
  117. REG_NAME,
  118. IPv4_RE,
  119. IP_LITERAL_RE,
  120. )
  121. USERINFO_RE = '^([' + UNRESERVED_RE + SUB_DELIMITERS_RE + ':]|%s)+' % (
  122. PCT_ENCODED
  123. )
  124. PORT_RE = '[0-9]{1,5}'
  125. # ####################
  126. # Path Matcher Section
  127. # ####################
  128. # See http://tools.ietf.org/html/rfc3986#section-3.3 for more information
  129. # about the path patterns defined below.
  130. segments = {
  131. 'segment': PCHAR + '*',
  132. # Non-zero length segment
  133. 'segment-nz': PCHAR + '+',
  134. # Non-zero length segment without ":"
  135. 'segment-nz-nc': PCHAR.replace(':', '') + '+'
  136. }
  137. # Path types taken from Section 3.3 (linked above)
  138. PATH_EMPTY = '^$'
  139. PATH_ROOTLESS = '%(segment-nz)s(/%(segment)s)*' % segments
  140. PATH_NOSCHEME = '%(segment-nz-nc)s(/%(segment)s)*' % segments
  141. PATH_ABSOLUTE = '/(%s)?' % PATH_ROOTLESS
  142. PATH_ABEMPTY = '(/%(segment)s)*' % segments
  143. PATH_RE = '^(%s|%s|%s|%s|%s)$' % (
  144. PATH_ABEMPTY, PATH_ABSOLUTE, PATH_NOSCHEME, PATH_ROOTLESS, PATH_EMPTY
  145. )
  146. FRAGMENT_RE = QUERY_RE = (
  147. '^([/?:@' + UNRESERVED_RE + SUB_DELIMITERS_RE + ']|%s)*$' % PCT_ENCODED
  148. )
  149. # ##########################
  150. # Relative reference matcher
  151. # ##########################
  152. # See http://tools.ietf.org/html/rfc3986#section-4.2 for details
  153. RELATIVE_PART_RE = '(//%s%s|%s|%s|%s)' % (
  154. COMPONENT_PATTERN_DICT['authority'],
  155. PATH_ABEMPTY,
  156. PATH_ABSOLUTE,
  157. PATH_NOSCHEME,
  158. PATH_EMPTY,
  159. )
  160. # See http://tools.ietf.org/html/rfc3986#section-3 for definition
  161. HIER_PART_RE = '(//%s%s|%s|%s|%s)' % (
  162. COMPONENT_PATTERN_DICT['authority'],
  163. PATH_ABEMPTY,
  164. PATH_ABSOLUTE,
  165. PATH_ROOTLESS,
  166. PATH_EMPTY,
  167. )
  168. # ###############
  169. # IRIs / RFC 3987
  170. # ###############
  171. # Only wide-unicode gets the high-ranges of UCSCHAR
  172. if sys.maxunicode > 0xFFFF: # pragma: no cover
  173. IPRIVATE = u'\uE000-\uF8FF\U000F0000-\U000FFFFD\U00100000-\U0010FFFD'
  174. UCSCHAR_RE = (
  175. u'\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF'
  176. u'\U00010000-\U0001FFFD\U00020000-\U0002FFFD'
  177. u'\U00030000-\U0003FFFD\U00040000-\U0004FFFD'
  178. u'\U00050000-\U0005FFFD\U00060000-\U0006FFFD'
  179. u'\U00070000-\U0007FFFD\U00080000-\U0008FFFD'
  180. u'\U00090000-\U0009FFFD\U000A0000-\U000AFFFD'
  181. u'\U000B0000-\U000BFFFD\U000C0000-\U000CFFFD'
  182. u'\U000D0000-\U000DFFFD\U000E1000-\U000EFFFD'
  183. )
  184. else: # pragma: no cover
  185. IPRIVATE = u'\uE000-\uF8FF'
  186. UCSCHAR_RE = (
  187. u'\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF'
  188. )
  189. IUNRESERVED_RE = u'A-Za-z0-9\\._~\\-' + UCSCHAR_RE
  190. IPCHAR = u'([' + IUNRESERVED_RE + SUB_DELIMITERS_RE + u':@]|%s)' % PCT_ENCODED
  191. isegments = {
  192. 'isegment': IPCHAR + u'*',
  193. # Non-zero length segment
  194. 'isegment-nz': IPCHAR + u'+',
  195. # Non-zero length segment without ":"
  196. 'isegment-nz-nc': IPCHAR.replace(':', '') + u'+'
  197. }
  198. IPATH_ROOTLESS = u'%(isegment-nz)s(/%(isegment)s)*' % isegments
  199. IPATH_NOSCHEME = u'%(isegment-nz-nc)s(/%(isegment)s)*' % isegments
  200. IPATH_ABSOLUTE = u'/(?:%s)?' % IPATH_ROOTLESS
  201. IPATH_ABEMPTY = u'(?:/%(isegment)s)*' % isegments
  202. IPATH_RE = u'^(?:%s|%s|%s|%s|%s)$' % (
  203. IPATH_ABEMPTY, IPATH_ABSOLUTE, IPATH_NOSCHEME, IPATH_ROOTLESS, PATH_EMPTY
  204. )
  205. IREGULAR_NAME_RE = IREG_NAME = u'(?:{0}|[{1}])*'.format(
  206. u'%[0-9A-Fa-f]{2}', SUB_DELIMITERS_RE + IUNRESERVED_RE
  207. )
  208. IHOST_RE = IHOST_PATTERN = u'({0}|{1}|{2})'.format(
  209. IREG_NAME,
  210. IPv4_RE,
  211. IP_LITERAL_RE,
  212. )
  213. IUSERINFO_RE = u'^(?:[' + IUNRESERVED_RE + SUB_DELIMITERS_RE + u':]|%s)+' % (
  214. PCT_ENCODED
  215. )
  216. IFRAGMENT_RE = (u'^(?:[/?:@' + IUNRESERVED_RE + SUB_DELIMITERS_RE
  217. + u']|%s)*$' % PCT_ENCODED)
  218. IQUERY_RE = (u'^(?:[/?:@' + IUNRESERVED_RE + SUB_DELIMITERS_RE
  219. + IPRIVATE + u']|%s)*$' % PCT_ENCODED)
  220. IRELATIVE_PART_RE = u'(//%s%s|%s|%s|%s)' % (
  221. COMPONENT_PATTERN_DICT['authority'],
  222. IPATH_ABEMPTY,
  223. IPATH_ABSOLUTE,
  224. IPATH_NOSCHEME,
  225. PATH_EMPTY,
  226. )
  227. IHIER_PART_RE = u'(//%s%s|%s|%s|%s)' % (
  228. COMPONENT_PATTERN_DICT['authority'],
  229. IPATH_ABEMPTY,
  230. IPATH_ABSOLUTE,
  231. IPATH_ROOTLESS,
  232. PATH_EMPTY,
  233. )