# python 3.7.3
import requests
import csv
from bs4 import BeautifulSoup
import re
url = "https://www.brownells.com/ammunition/handgun-ammo/usa-white-box-ammo-380-auto-95gr-fmj-prod95261.aspx"
response = requests.get(url,headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'})
src = response.content
soup = BeautifulSoup(src, 'lxml')
price = soup.find("script", {"id": "rawData"})
price = re.search(r'(?<=\\u003cspan\\u003e\$).*(?=\\u003c/span\\u003e\\u003cspan)', price.text)
print(price[0])
# print(price[1])
# There are multiple patterns to match in the string, and I'm planning to pull the 2nd or 3rd one, not just the first.
# For simplicity I'm just pulling the first above.
Expected: 16.99
Actual: 16.99\u003c/span...\u003cspan\u003e$32.99 (rest of the string until the last instance of \u003c/span\u003e\u003cspan)
I tested my regex in regexr and regex101 and it works there:
https://regex101.com/r/yzMkTg/103
I also tried the regex on a string, and it works fine:
import requests
import csv
from bs4 import BeautifulSoup
import re
price = "\\u003cspan\\u003e$321.99\\u003c/span\\u003e\\"
# \u003cspan\u003e$321.99\u003c/span\u003e\
print(price)
price = re.search(r'(?<=\\u003cspan\\u003e\$)(.*)(?=\\u003c/span\\u003e)', price)
print(price[0])
# print(price[1])
# \\u003cspan\\u003e$321.99\\u003c/span\\u003e\\
# View OSHA SDS\u003c/a\u003e\r\n \u003c/section\u003e\r\n
# \u003cspan\u003e$321.99\u003c/span\u003e\
Something with Beautiful Soup seems to be tripping something up and causing it to skip to the last instance of the positive lookahead.
Why is python's regex taking the positive lookbehind but ignoring positive lookahead after the string match until the last instance of the lookahead?