I have a script that stores XML (as Python str objects) fetched from various websites. Some of these responses are being truncated even though the SQL column type is NVARCHAR(MAX), and I need to know why. I'm not sure if it's a timing issue (the response is stored to SQL before it has finished downloading) or a paging(?) issue.
The problem appears somewhere around 62k to 65k characters and above; anything below that count and the document is saved intact.
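To rule out the timing theory, here is a minimal check I can run first: read the raw bytes in a loop until read() returns b'' and log the total, so the full payload is provably downloaded before any SQL work happens (fetch_raw and the chunk size are just illustrative):

import urllib.request

def fetch_raw(url):
    request = urllib.request.Request(url=url, headers={'User-Agent': 'Mozilla/5.0'})
    with urllib.request.urlopen(request) as response:
        chunks = []
        while True:
            chunk = response.read(64 * 1024)
            if not chunk:  # b'' signals end of stream
                break
            chunks.append(chunk)
    body = b''.join(chunks)
    print(len(body))  # byte count of the full download, for comparison later
    return body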
Code:
import logging
import urllib.error
import urllib.request
from datetime import datetime, timedelta, timezone

import pyodbc
from bs4 import BeautifulSoup

def get_feed(url):
    try:
        header = {'User-Agent': 'Mozilla/5.0'}
        request = urllib.request.Request(url=url, headers=header)
        response = urllib.request.urlopen(request)
        # BeautifulSoup reads the response object to EOF and parses it as XML
        xml = BeautifulSoup(response, 'xml')
        return xml
    except urllib.error.HTTPError:
        return None

def get_sql_connection():
    try:
        server = 'mydb.database.windows.net'
        database = 'mydb'
        driver = '{ODBC Driver 17 for SQL Server}'
        conn_string = 'DRIVER=' + driver + ';SERVER=' + server + ';DATABASE=' + database
        conn = pyodbc.connect(conn_string + ';Authentication=ActiveDirectoryMsi')
        return conn
    except Exception as e:
        logging.error(e)
        return None

def write_xml(conn, xml):
    end_format = '%Y-%m-%dT%H:%M:%S%z'
    cursor = conn.cursor()
    data = {
        "xml_doc": str(xml),
        "created": datetime.now(timezone(timedelta(hours=-8))).strftime(end_format)}
    # xml_doc is passed as a bound parameter, so its length should only be
    # limited by the NVARCHAR(MAX) column
    cursor.execute("INSERT INTO dbo.XML (xml_doc, created) VALUES (?,?)",
                   data['xml_doc'], data['created'])
    conn.commit()
#-------------------------
conn = get_sql_connection()
url = 'https://example.com'
xml = get_feed(url)
write_xml(conn, xml)
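If the problem turns out to be on the INSERT side, one thing worth trying is telling pyodbc explicitly to bind the first parameter as NVARCHAR(max). This is only a sketch of an idea, not a confirmed fix; the None entry leaves the second parameter at its default binding:

cursor = conn.cursor()
# (pyodbc.SQL_WVARCHAR, 0, 0) asks for an NVARCHAR(max) binding rather
# than letting the driver pick a capped length (untested assumption)
cursor.setinputsizes([(pyodbc.SQL_WVARCHAR, 0, 0), None])
cursor.execute("INSERT INTO dbo.XML (xml_doc, created) VALUES (?,?)",
               data['xml_doc'], data['created'])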
Again: anything beyond a character count of roughly 62k-65k, and the XML is truncated.
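To tell whether the stored rows themselves are short or only their display is, this sketch compares the length Python sent with what SQL Server reports (LEN counts characters, DATALENGTH counts bytes; check_stored_length is mine, and it assumes the dbo.XML table from the script above):

def check_stored_length(conn, xml_str):
    cursor = conn.cursor()
    print('sent chars:  ', len(xml_str))
    # pull the most recent row back and ask the server for its real size
    row = cursor.execute(
        "SELECT TOP 1 LEN(xml_doc), DATALENGTH(xml_doc) "
        "FROM dbo.XML ORDER BY created DESC").fetchone()
    print('stored chars:', row[0])
    print('stored bytes:', row[1])  # NVARCHAR is roughly 2 bytes per character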