I have a csv file containing around 8 Million records, but it is taking more than an hour to complete the process, so please could you please help me with this?
Note: There is no issue with the python code; it works very well without any errors. The only problem is that is taking too much time to load and process the 8M records.
Here is the code
import pandas as pd
import numpy as np
import ipaddress
from pathlib import Path
import shutil
import os
from time import time
start = time()
inc_path = 'C:/Users/phani/OneDrive/Desktop/pandas/inc'
arc_path = 'C:/Users/phani/OneDrive/Desktop/pandas/arc'
dropZone_path = 'C:/Users/phani/OneDrive/Desktop/pandas/dropZone'
for src_file in Path(dropZone_path).glob('XYZ*.csv*'):
process_file = shutil.copy(os.path.join(dropZone_path, src_file), arc_path)
for sem_file in Path(dropZone_path).glob('XYZ*.sem'):
semaphore_file = shutil.copy(os.path.join(dropZone_path, sem_file), inc_path)
# rename the original file
for file in os.listdir(dropZone_path):
file_path = os.path.join(dropZone_path, file)
shutil.copy(file_path, os.path.join(arc_path, "Original_" + file))
for sema_file in
Path(arc_path).glob('Original_XYZ*.sem*'):
os.remove(sema_file)
## Read CSVfile from TEMP folder
df = pd.read_csv(process_file)
df.sort_values(["START_IP_ADDRESS"], ascending=True,)
i = 0
while i < len(df) - 1:
i += 1
line = df.iloc[i:i + 1].copy(deep=True)
curr_START_IP_NUMBER = line.START_IP_NUMBER.values[0]
curr_END_IP_NUMBER = line.END_IP_NUMBER
prev_START_IP_NUMBER = df.loc[i - 1, 'START_IP_NUMBER']
prev_END_IP_NUMBER = df.loc[i - 1, 'END_IP_NUMBER']
# if no gap - continue
if curr_START_IP_NUMBER == (prev_END_IP_NUMBER + 1):
continue
# else fill the gap
# new line start ip number
line.START_IP_NUMBER = prev_END_IP_NUMBER + 1
line.START_IP_ADDRESS = (ipaddress.ip_address(int(line.START_IP_NUMBER)))
# new line end ip number
line.END_IP_NUMBER = curr_START_IP_NUMBER - 1
line.END_IP_ADDRESS = (ipaddress.ip_address(int(line.END_IP_NUMBER)))
line.COUNTRY_CODE = ''
line.LATITUDE_COORDINATE = ''
line.LONGITUDE_COORDINATE = ''
line.ISP_NAME = ''
line.AREA_CODE = ''
line.CITY_NAME = ''
line.METRO_CODE = ''
line.ORGANIZATION_NAME = ''
line.ZIP_CODE = ''
line.REGION_CODE = ''
# insert the line between curr index to previous index
df = pd.concat([df.iloc[:i], line, df.iloc[i:]]).reset_index(drop=True)
df.to_csv(process_file, index=False)
for process_file in Path(arc_path).glob('XYZ*.csv*'):
EREFile_CSV = shutil.copy(os.path.join(arc_path, process_file), inc_path)