
I am working on a script which downloads a large audit-log CSV file from Azure DevOps and filters the data according to a given condition. This works for a small CSV file, but for a file with a large amount of data it fails with

fields = next(reader) StopIteration

Can someone help with the changes required in the script? I am using Python 3.7.9 on macOS.

import argparse
import csv
import datetime
import json
import os
from itertools import zip_longest

import pandas as pd
import requests
from requests.auth import HTTPBasicAuth


def getproject(url,pat):

    response = requests.get(url, auth=HTTPBasicAuth(username='',password=pat))

    if response.status_code == 200:
        url_data = response.content
        tempfile = open("temp.csv","wb")
        tempfile.write(url_data)
        tempfile.close()
        return url_data

    else:
        print("\nERROR : Unable to conect The server...")


def FilterData():
    lists   =[]
    pro_name=[]
    RepoId  =[]
    RepoName=[]

    new_file = open("temp_new.csv", 'w',newline='')
    writer = csv.writer(new_file)
    with open("temp.csv", 'r') as readFile:
    reader = csv.reader(readFile)
    fields = next(reader) 
    lists.append(fields)
    for row in reader:
        for field in row:
            if field == "Git.RepositoryCreated":
                lists.append(row)
    writer.writerows(lists) 
    readFile.close()
    new_file.close()
    os.remove("temp.csv")

    timestamp = (datetime.datetime.now())
    timestamp = timestamp.strftime("%d%B%Y_%H%M%S") 
    file_name = "Data2_"+str(timestamp)+".csv"

    file1 = open("temp_new.csv",'r')
    df = pd.read_csv(file1)
    for i in df["Data"]:
       res = json.loads(i) 
       pro_name.append(res['ProjectName'])
       RepoId.append(res['RepoId'])
       RepoName.append(res['RepoName'])
    Disp_Name = df["ActorDisplayName"]
    ActionId  = df["ActionId"]
    TimeStamp = df["Timestamp"]
    file1.close()
    os.remove("temp_new.csv")


    Header = ["Actor Display Name","Project 
               Name","RepoName","RepoId","ActionId","Timestamp"]  
    d=[Disp_Name,pro_name,RepoName,RepoId,ActionId,TimeStamp]
    export_data = zip_longest(*d, fillvalue = '')
    with open(file_name, 'w',newline='') as myfile:
        wr = csv.writer(myfile)
        wr.writerow(Header)
        wr.writerows(export_data)

if __name__ == '__main__':

    parser = argparse.ArgumentParser(description="This is used for getting a list of the projects")
    parser.add_argument("-o" , dest="org", help="org name")
    parser.add_argument("-p" , dest="pat", help="pat value")
    parser.add_argument("-sd" , dest="sdate", help="Start Date")
    parser.add_argument("-ed" , dest="edate", help="End Date")

    args = parser.parse_args()
    org  = args.org
    token = args.pat
    startdate = args.sdate
    enddate = args.edate
    
    url = "https://auditservice.dev.azure.com/{org_name}/_apis/audit/downloadlog?  
        format=csv&startTime={startdt}&endTime={enddt}&api-version=6.1- 
        preview.1".format(org_name=org,startdt=startdate,enddt=enddate)

    #call "getproject" function to check url and token to further create required csv
    getproject(url,token)
 
    FilterData()
megha

1 Answer

[+] In your getproject function, you should use a try/except block to handle HTTP errors etc.

[+] If the CSV file you're trying to download is quite large, it may be best to write the data in chunks.

As for the fields = next(reader) StopIteration error, I'm not sure. ¯\_(ツ)_/¯ Try throwing your code in the debugger and stepping through it.
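
One guess: if getproject gets a non-200 response, temp.csv either never gets written or ends up empty, so there is no header row for next(reader) to return. A minimal guard, assuming that's the cause (read_filtered_rows is just a hypothetical helper name, not part of your script):

import csv

def read_filtered_rows(path="temp.csv"):
    with open(path, 'r', newline='') as readFile:
        reader = csv.reader(readFile)
        # next() with a default avoids StopIteration on an empty file
        fields = next(reader, None)
        if fields is None:
            print("[-] temp.csv is empty - the download probably failed")
            return []
        rows = [fields]
        # keep the rows that match your condition, plus the header
        rows.extend(row for row in reader if "Git.RepositoryCreated" in row)
        return rows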

See: download large file in python with requests

import requests
from requests.auth import HTTPBasicAuth

def getproject(url,pat):
    try:
        # NOTE the stream=True parameter below
        with requests.get(url, auth=HTTPBasicAuth(username='',password=pat), stream=True) as r:
            r.raise_for_status()
            with open('tmp.csv', 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192): 
                    # If you have chunk encoded response uncomment if
                    # and set chunk_size parameter to None.
                    #if chunk: 
                    f.write(chunk)

    except requests.exceptions.ConnectionError as c_error:
        print(f"[-] Connection Error: {c_error}")
    except requests.exceptions.Timeout as t_error:
        print(f"[-] Connection Timeout Error: {t_error}")
    except requests.exceptions.RequestException as req_error:
        print(f"[-] Some Ambiguous Exception: {req_error}")


# This way seems faster based upon the comments of the link I shared
import requests
import shutil

def download_file(url):
    local_filename = url.split('/')[-1]
    with requests.get(url, stream=True) as r:
        with open(local_filename, 'wb') as f:
            shutil.copyfileobj(r.raw, f)

    return local_filename
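
One thing to watch with that helper for the audit URL above: url.split('/')[-1] would turn the whole query string into the filename. A small variation, assuming you still want basic auth with the PAT and the file saved as temp.csv so FilterData can pick it up:

import shutil
import requests
from requests.auth import HTTPBasicAuth

def download_file(url, pat, local_filename="temp.csv"):
    # stream=True keeps the whole body out of memory; copyfileobj streams it to disk
    with requests.get(url, auth=HTTPBasicAuth(username='', password=pat), stream=True) as r:
        r.raise_for_status()
        with open(local_filename, 'wb') as f:
            shutil.copyfileobj(r.raw, f)
    return local_filename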
Phil_Miller