Pulling three csv/xls files into dataframes causes one of them to not be found

Question

Excuse the title. This is just a very strange error I am getting. When I try to pull two xls files and one csv file into a dataframe I get this error:

FileNotFoundError: [Errno 2] No such file or directory: '.\\pa-dirty-price-crawler\\goldman_folder\\Trade_Detail_GSCO_GREAT_AJAX_OPERATING_PARTNERSHIP_L.P._COB08_Apr_2020_19_1586111993954.xls'

That error occurs when I do this:

nomura = get_nomura_df_by_date(start_date)
jpm = get_jpm_df_by_date(start_date)
gs = get_goldman_df_by_date(start_date)

Now if I comment out either nomura or jpm then there is no error at all. In other words if I do this

# nomura = get_nomura_df_by_date(start_date)
jpm = get_jpm_df_by_date(start_date)
gs = get_goldman_df_by_date(start_date)

Then I receive no error at all which is weird. The xls and csv files are all in separate folders in my workspace, i.e., I have one folder for goldman, one for nomura, and one for jpm.

Here is the entire code:

import win32com.client
import os, zipfile
import pandas as pd
import pyodbc
import sql_utils as sql
import datetime as dt
import time
import xlrd

output_file = r"C:\Users\morgan.weiss\workspace\pa-dirty-price-crawler\output\All_Bond_Repos.xlsx"


def get_jpm_zip():
    Outlook = win32com.client.Dispatch("Outlook.Application")
    olNs = Outlook.GetNamespace("MAPI")
    Inbox = olNs.GetDefaultFolder(6)

    Filter = ("@SQL=" + chr(34) + "urn:schemas:httpmail:subject" +
            chr(34) + " Like '%JPMS Statement%' AND " +
            chr(34) + "urn:schemas:httpmail:hasattachment" +
            chr(34) + "=1")

    Items = Inbox.Items.Restrict(Filter)

    dates = []
    for a in Items:
        dates.append(a.senton.date())

    for Item in Items:
        for attachment in Item.Attachments:
            attachment.SaveAsFile(r"C:\\Users\\morgan.weiss\\workspace\\pa-dirty-price-crawler\\jpm_folder\\" +  attachment.FileName)
    return dates


def get_nomura_csv():
    Outlook = win32com.client.Dispatch("Outlook.Application")
    olNs = Outlook.GetNamespace("MAPI")
    Inbox = olNs.GetDefaultFolder(6)

    Filter = ("@SQL=" + chr(34) + "urn:schemas:httpmail:subject" +
            chr(34) + " Like '%Nomura (NSI) Repo%' AND " +
            chr(34) + "urn:schemas:httpmail:hasattachment" +
            chr(34) + "=1")

    Items = Inbox.Items.Restrict(Filter)

    dates = []
    for a in Items:
        dates.append(a.senton.date())

    dates_attachment_map = {}
    i = 0
    for Item in Items:
        for attachment in Item.Attachments:
            dates_attachment_map[dates[i]] = attachment.FileName
            i+=1
            attachment.SaveAsFile(r"C:\\Users\\morgan.weiss\\workspace\\pa-dirty-price-crawler\\nomura_folder\\" +  attachment.FileName)
    return dates_attachment_map

def get_goldman_csv():
    Outlook = win32com.client.Dispatch("Outlook.Application")
    olNs = Outlook.GetNamespace("MAPI")
    Inbox = olNs.GetDefaultFolder(6)

    Filter = ("@SQL=" + chr(34) + "urn:schemas:httpmail:subject" +
            chr(34) + " Like '%Repo Margin Summary from GOLDMAN SACHS%' AND " +
            chr(34) + "urn:schemas:httpmail:hasattachment" +
            chr(34) + "=1")

    Items = Inbox.Items.Restrict(Filter)

    dates = []
    for a in Items:
        dates.append(a.senton.date())

    dates_attachment_map = {}
    i = 0
    for Item in Items:
        for attachment in Item.Attachments:
            if 'Trade_Detail_GSCO_GREAT_AJAX' in attachment.FileName:
                dates_attachment_map[dates[i]] = attachment.FileName
                i+=1
                attachment.SaveAsFile(r"C:\\Users\\morgan.weiss\\workspace\\pa-dirty-price-crawler\\goldman_folder\\" +  attachment.FileName)
    return dates_attachment_map

def unzip_jpm_files():
    jpm = get_jpm_zip()
    dir_name = r'C:\Users\morgan.weiss\workspace\pa-dirty-price-crawler\jpm_folder'
    extension = ".zip"
    os.chdir(dir_name) # change directory from working dir to dir with files


    jpm_map = {}
    i = 0

    for item in os.listdir(dir_name): # loop through items in dir
        if item.endswith(extension): # check for ".zip" extension
            file_name = os.path.abspath(item) # get full path of files
            zip_ref = zipfile.ZipFile(file_name) # create zipfile object
            zip_ref.extractall(dir_name, pwd=b'qpsqpwsr') # extract file to dir
            zip_ref.close() # close file
            os.remove(file_name) # delete zipped file

    for item in os.listdir(dir_name):
        if item.endswith(".xls"):
            file_name = os.path.abspath(item)
            jpm_map[jpm[i]] = file_name
            i+=1
    return jpm_map

def get_jpm_data_frames(path_name):
    wb = xlrd.open_workbook(path_name, logfile=open(os.devnull, 'w'))
    df = pd.read_excel(wb,skiprows=4,engine='xlrd')
    return df

def get_nomura_data_frames(file_name):
    dir_name = '.\\pa-dirty-price-crawler\\nomura_folder'
    path_name = os.path.join(dir_name, file_name)
    df = pd.read_csv(path_name)
    return df

def get_gs_data_frames(file_name):
    dir_name = '.\\pa-dirty-price-crawler\\goldman_folder'
    path_name = os.path.join(dir_name, file_name)
    print(os.path.isfile(path_name))
    df = pd.read_excel(path_name,skiprows=9, nrows=12)
    return df

def get_data_from_sql():
    cnxn = pyodbc.connect(sql.connection_string)
    df = pd.read_sql(sql.get_bond_repos,con=cnxn)
    return df

def compare_dates(mail_date, date_time):
    return mail_date.year == date_time.year and mail_date.month == date_time.month and mail_date.day == date_time.day

def get_jpm_df_by_date(date):
    jpm_map = unzip_jpm_files()
    time.sleep(10)
    jpm_df = pd.DataFrame()
    jpm_df = get_jpm_data_frames(jpm_map[date])
    return jpm_df

def get_nomura_df_by_date(date):
    nomura_map = get_nomura_csv()
    time.sleep(10)
    nomura_df = pd.DataFrame()
    nomura_df = get_nomura_data_frames(nomura_map[date])
    return nomura_df

def get_goldman_df_by_date(date):
    goldman_map = get_goldman_csv()
    time.sleep(10)
    goldman_df = pd.DataFrame()
    goldman_df = get_gs_data_frames(goldman_map[date])
    return goldman_df

def edit_isin(df):
    df['ISIN'] = df['ISIN'].apply(lambda x: x[2:])
    df['ISIN'] = df['ISIN'].apply(lambda x: x[:-1])
    return df



x = '2020-04-09'
start_date = dt.datetime.strptime(x, "%Y-%m-%d")
start_date = start_date.date()

df = get_data_from_sql()
# nomura = get_nomura_df_by_date(start_date)
gs = get_goldman_df_by_date(start_date)
jpm = get_jpm_df_by_date(start_date)



# gs = edit_isin(gs)
# df = df.set_index('Cusip').join(gs.set_index('ISIN'))
# print(df.head())


# df.to_excel(output_file, index=False)

Note that I took out my name and put "usr_name" in the post. I am baffled by this error and I have no idea why it's breaking.

EDIT:

I am starting to identify the issue. When I just call the function to get the zip folders then the code runs fine, the issue occurs when I unzip the folders when I am trying to get all the dataframes. Perhaps that may add some details that you guys could point out for a solution.

so your saying it works then it doesn't work? it might be a caching issue, especially if you're working with servers. — Umar.H, Apr 16 '20 at 18:51
@Datanovice I am not using servers. What I am trying to illustrate is that when I try to load all three dataframes I get an error. But if I just upload 2 then its fine, does that make sense? — Snorrlaxxx, Apr 16 '20 at 19:07
Does file `'.\\pa-dirty-price-crawler\\goldman_folder\\Trade_Detail_GSCO_GREAT_AJAX_OPERATING_PARTNERSHIP_L.P._COB08_Apr_2020_19_1586111993954.xls'` exist in your system? — Ehsan, Apr 16 '20 at 19:18
Then it's just a file not found error? The file doesn't exist or the path is wrong — Umar.H, Apr 16 '20 at 19:18
@Datanovice The file exists and again if comment out the nomura part then all of a sudden the file exists and I have no problems. — Snorrlaxxx, Apr 16 '20 at 19:25
@Snorrlaxxx then I think it's less of a pandas/python issue and rather a windows issue. I had the same issue when working primarly on windows and with pandas. — Umar.H, Apr 16 '20 at 19:27
@Datanovice Not surprising it is a Windows problem, is there a way to fix it then? — Snorrlaxxx, Apr 16 '20 at 19:41
@trigonom I tried time.sleep(5) but got the same error, I will try 10 but I am guessing it will be the same problem. — Snorrlaxxx, Apr 17 '20 at 19:45
@Snorrlaxxx replace ` \\ ` with just ` \ ` and run the code again lets see. just like in ` r'C:\Users\usr_name\workspace\pa-dirty-price-crawler\jpm_folder' ` — Laenka-Oss, Apr 20 '20 at 16:10
@Laenka-Oss Sorry I do not understand what you want me to change can you clarify? — Snorrlaxxx, Apr 20 '20 at 16:12
Does `os.path.isfile(path_name)` in `get_gs_data_frames()` return True or False (prior to calling `pd.read_excel()`)? — James Daily, Apr 20 '20 at 16:31
@JamesDaily Edited my code above with what you asked and yes the output is True. — Snorrlaxxx, Apr 20 '20 at 16:40
@Laenka-Oss You want me to make that replacement for all? I believe if I make all those changes there is going to be some errors. — Snorrlaxxx, Apr 20 '20 at 16:42
@Laenka-Oss Making your changes I get this error: File "c:/Users/usr_name/workspace/pa-dirty-price-crawler/src/outlook.py", line 31 attachment.SaveAsFile(r"C:\Users\usr_name\workspace\pa-dirty-price-crawler\jpm_folder\" + attachment.FileName) ^ SyntaxError: EOL while scanning string literal — Snorrlaxxx, Apr 20 '20 at 16:45
yes@Snorrlaxx and then as @JamesDaily pointed, my idea will not even help if os.path.exists(path_names) are all True. — Laenka-Oss, Apr 20 '20 at 16:46
Hard to tell given can't see the data. Could you post the full stacktrace? A few things stand out but fall short of being smoking guns: 1) mix of relative and absolute filepaths (make all absolute) 2.) while windows can handle bad filenames, the fact that error occurs on `L.P._CO` is curious. consider [slugifying the names](https://stackoverflow.com/questions/295135/turn-a-string-into-a-valid-filename) before saving. 3.) isolate your download step from import step in testing at least. 4.) Trying refactoring your close-to-identical functions into one to avoid small errors cropping up. — webelo, Apr 25 '20 at 23:04

score 2 · Accepted Answer · answered Apr 20 '20 at 16:49

I fixed my code the weird fix is simply getting the dataframes in this order:

nomura = get_nomura_df_by_date(start_date)
gs = get_goldman_df_by_date(start_date)
jpm = get_jpm_df_by_date(start_date)

Then magically there is no errors...why? I have no idea...

score 0 · Answer 2 · answered Apr 25 '20 at 22:24

0

What is the purpose of the time.sleep() calls? My guess is whatever reason you put them there is the reason it's failing

answered Apr 25 '20 at 22:24

Rashan Arshad

247
3
11

Pulling three csv/xls files into dataframes causes one of them to not be found

2 Answers2