0

I am a Python beginner. Python versions 3.8 and 3.9

In some existing URL validation code, I ran into issues with a password containing '['. The password is PN-[.d.g5(R{bK}[5ZLx,4~K*hHrSy32=q+

URL:

"https://p124_ddm028127:PN-[.d.g5(R{bK}[5ZLx,4~K*hHrSy32=q+@git.net/scm/sample-config.git"

The code that is failing uses the validators==0.18.1 package:

if validators.url(url):
    # other code

I checked the regex used by the validators library, it uses the following for username and password:

# user:pass authentication
r"(?:[-a-z\u00a1-\uffff0-9._~%!$&'()*+,;=:]+"
r"(?::[-a-z0-9._~%!$&'()*+,;=:]*)?@)?"

I decided to write a simple test using the above regex but adding the square brackets as valid input. I tried the following:

  • I read many threads on Stack Overflow and elsewhere that suggested using a '\' to escape the square brackets. This didn't work:
#user:pass authentication
r"(?:[-a-z\u00a1-\uffff0-9._~%!$&'()*+,;=:]+"
r"(?::[-a-z0-9._\[\]~%!$&'()*+,;=:]*)?@)?"
  • I tried adding the Unicode values of the square brackets instead, also in vain.
  • Instead of validators, I tried urllib.parse.urlparse:
def url_parse(url):
    """Return True if *url* parses with both a scheme and a network location.

    Returns False when either component is empty or when ``urlparse``
    rejects the input with ``ValueError``.
    """
    try:
        parts = urlparse(url)
    except ValueError:
        # Parsing itself failed, so the URL is treated as invalid.
        return False
    # Both components must be non-empty for the URL to count as valid.
    return bool(parts.scheme and parts.netloc)

Any suggestions?

Wiktor Stribiżew
  • 607,720
  • 39
  • 448
  • 563
Kaliyug Antagonist
  • 3,512
  • 9
  • 51
  • 103
  • 1
    It seems the curly braces are culprits, too. You need to replace `[-a-z\u00a1-\uffff0-9._~%!$&'()*+,;=:]+` with `[-a-z\u00a1-\uffff0-9._~%!$&'()*+,;=:{}[\]]+` and probably `(?::[-a-z0-9._~%!$&'()*+,;=:]*)?@)?` with `(?::[-a-z0-9._~%!$&'()*+,;=:{}[\]]*)?@)?`, see [this regex demo](https://regex101.com/r/gP4ajg/2) and [this Python demo](https://ideone.com/IMAbQ5). – Wiktor Stribiżew Jun 02 '21 at 13:02
  • During the tests, I realized it fails even for many other special characters, needs to be changed a lot. – Kaliyug Antagonist Jun 03 '21 at 05:26

1 Answer

0

There are also curly braces in the problematic password, so you need to add them to the character class too.

r"(?::[-a-z0-9._\[\]\{\}~%!$&'()*+,;=:]*)?@)?"

import re

# Fragment matching "." plus an inner IPv4 octet (0-255; the first alternative
# also lets through two-digit forms with a leading zero such as "07").
ip_middle_octet = r"(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5]))"
# Fragment matching "." plus the final IPv4 octet (0-255, no leading zeros).
ip_last_octet = r"(?:\.(?:0|[1-9]\d?|1\d\d|2[0-4]\d|25[0-5]))"

# URL pattern assembled from adjacent raw-string pieces (plus the two octet
# fragments above). It is anchored with ^ and $, so the whole input must match.
regex = re.compile(  # noqa: W605
    r"^"
    # protocol identifier (http, https or ftp)
    r"(?:(?:https?|ftp)://)"
    # user:pass authentication (the whole "user[:pass]@" group is optional)
    # user name: this class does NOT include [ ] { }
    r"(?:[-a-z\u00a1-\uffff0-9._~%!$&'()*+,;=:]+"
    # password: optional, introduced by ":"; this class additionally accepts
    # the escaped [ ] { } so passwords containing them can match
    r"(?::[-a-z0-9._\[\]\{\}~%!$&'()*+,;=:]*)?@)?"
    # host: exactly one of the alternatives below
    r"(?:"
    r"(?P<private_ip>"
    # IP address exclusion
    # private & local networks
    # 10.x.x.x and 127.x.x.x
    r"(?:(?:10|127)" + ip_middle_octet + r"{2}" + ip_last_octet + r")|"
    # 169.254.x.x and 192.168.x.x
    r"(?:(?:169\.254|192\.168)" + ip_middle_octet + ip_last_octet + r")|"
    # 172.16.x.x through 172.31.x.x
    r"(?:172\.(?:1[6-9]|2\d|3[0-1])" + ip_middle_octet + ip_last_octet + r"))"
    r"|"
    # private & local hosts
    r"(?P<private_host>"
    r"(?:localhost))"
    r"|"
    # IP address dotted notation octets
    # excludes loopback network 0.0.0.0
    # excludes reserved space >= 224.0.0.0
    # excludes network & broadcast addresses
    # (first & last IP address of each class)
    # The first octet below is limited to 1-223.
    # NOTE(review): ip_last_octet as written accepts 0 and 255, so the
    # "network & broadcast" exclusion does not appear to be enforced by this
    # pattern -- confirm whether that is intended.
    r"(?P<public_ip>"
    r"(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])"
    r"" + ip_middle_octet + r"{2}"
    r"" + ip_last_octet + r")"
    r"|"
    # IPv6 RegEx from https://stackoverflow.com/a/17871737
    # (the address must be wrapped in literal square brackets)
    r"\[("
    # 1:2:3:4:5:6:7:8
    r"([0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|"
    # 1::                              1:2:3:4:5:6:7::
    r"([0-9a-fA-F]{1,4}:){1,7}:|"
    # 1::8             1:2:3:4:5:6::8  1:2:3:4:5:6::8
    r"([0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|"
    # 1::7:8           1:2:3:4:5::7:8  1:2:3:4:5::8
    r"([0-9a-fA-F]{1,4}:){1,5}(:[0-9a-fA-F]{1,4}){1,2}|"
    # 1::6:7:8         1:2:3:4::6:7:8  1:2:3:4::8
    r"([0-9a-fA-F]{1,4}:){1,4}(:[0-9a-fA-F]{1,4}){1,3}|"
    # 1::5:6:7:8       1:2:3::5:6:7:8  1:2:3::8
    r"([0-9a-fA-F]{1,4}:){1,3}(:[0-9a-fA-F]{1,4}){1,4}|"
    # 1::4:5:6:7:8     1:2::4:5:6:7:8  1:2::8
    r"([0-9a-fA-F]{1,4}:){1,2}(:[0-9a-fA-F]{1,4}){1,5}|"
    # 1::3:4:5:6:7:8   1::3:4:5:6:7:8  1::8
    r"[0-9a-fA-F]{1,4}:((:[0-9a-fA-F]{1,4}){1,6})|"
    # ::2:3:4:5:6:7:8  ::2:3:4:5:6:7:8 ::8       ::
    r":((:[0-9a-fA-F]{1,4}){1,7}|:)|"
    # fe80::7:8%eth0   fe80::7:8%1
    # (link-local IPv6 addresses with zone index)
    r"fe80:(:[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|"
    r"::(ffff(:0{1,4}){0,1}:){0,1}"
    r"((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}"
    # ::255.255.255.255   ::ffff:255.255.255.255  ::ffff:0:255.255.255.255
    # (IPv4-mapped IPv6 addresses and IPv4-translated addresses)
    r"(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])|"
    r"([0-9a-fA-F]{1,4}:){1,4}:"
    r"((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}"
    # 2001:db8:3:4::192.0.2.33  64:ff9b::192.0.2.33
    # (IPv4-Embedded IPv6 Address)
    r"(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])"
    r")\]|"
    # host name
    r"(?:(?:(?:xn--)|[a-z\u00a1-\uffff\U00010000-\U0010ffff0-9]-?)*"
    r"[a-z\u00a1-\uffff\U00010000-\U0010ffff0-9]+)"
    # domain name (zero or more further dot-separated labels)
    r"(?:\.(?:(?:xn--)|[a-z\u00a1-\uffff\U00010000-\U0010ffff0-9]-?)*"
    r"[a-z\u00a1-\uffff\U00010000-\U0010ffff0-9]+)*"
    # TLD identifier (required for the name-based host alternative)
    r"(?:\.(?:(?:xn--[a-z\u00a1-\uffff\U00010000-\U0010ffff0-9]{2,})|"
    r"[a-z\u00a1-\uffff\U00010000-\U0010ffff]{2,}))"
    r")"
    # port number (optional, 2 to 5 digits)
    r"(?::\d{2,5})?"
    # resource path (optional)
    r"(?:/[-a-z\u00a1-\uffff\U00010000-\U0010ffff0-9._~%!$&'()*+,;=:@/]*)?"
    # query string (optional; any run of non-whitespace after "?")
    r"(?:\?\S*)?"
    # fragment (optional; any run of non-whitespace after "#")
    r"(?:#\S*)?"
    r"$",
    # IGNORECASE lets the a-z ranges above match upper-case letters as well.
    re.UNICODE | re.IGNORECASE
)

# Smoke checks: plain credentials, brackets in the password, and the full
# password from the question (with example.com as the host).
assert regex.match("http://foo:bar@example.com")
assert regex.match("http://foo:b[a]r@example.com")
assert regex.match("http://foo:PN-[.d.g5(R{bK}[5ZLx,4~K*hHrSy32=q+@example.com")
mugiseyebrows
  • 4,138
  • 1
  • 14
  • 15