This is a follow-up to this SO question:
NumPy "record array" or "structured array" or "recarray"
I am unable to figure out which one is the best for my situation.
For my data, one column is an int, and the other column is a variable length (2-150) batch of ints.
Below is code that downloads a small sample of the data (about 10 MB) and opens it in pandas:
import requests
import pickle
import numpy as np
import pandas as pd
def download_file_from_google_drive(id, destination):
    """Download a Google Drive file and write it to *destination*.

    The file is requested from the Google Drive download endpoint with
    ``id`` as a query parameter. If ``get_confirm_token`` finds a token in
    the first response, the request is repeated with that token passed as
    the ``confirm`` parameter. The final response body is streamed to disk
    by ``save_response_content``.

    Args:
        id: Identifier of the file, sent as the ``id`` query parameter.
            (The name shadows the builtin ``id``; it is kept so existing
            keyword callers continue to work.)
        destination: Path of the local file to write.
    """
    URL = "https://docs.google.com/uc?export=download"

    # Use the session as a context manager so its pooled connections are
    # released even if a request or the file write raises.
    with requests.Session() as session:
        response = session.get(URL, params={'id': id}, stream=True)
        token = get_confirm_token(response)

        if token:
            # The first response is streamed and never read; close it so
            # its connection is returned before issuing the second request.
            response.close()
            params = {'id': id, 'confirm': token}
            response = session.get(URL, params=params, stream=True)

        # Must run inside the ``with`` block: the body is streamed lazily
        # and needs the session's connection to still be open.
        save_response_content(response, destination)
def get_confirm_token(response):
    """Return the value of the first ``download_warning*`` cookie, if any.

    Args:
        response: Object whose ``cookies.items()`` yields ``(name, value)``
            pairs.

    Returns:
        The value of the first cookie whose name starts with
        ``'download_warning'``, or ``None`` when no cookie matches.
    """
    matching_values = (
        cookie_value
        for cookie_name, cookie_value in response.cookies.items()
        if cookie_name.startswith('download_warning')
    )
    # ``next`` with a default stops at the first match and yields None
    # when the generator is exhausted without one.
    return next(matching_values, None)
def save_response_content(response, destination):
    """Write the body of *response* to the file at *destination*.

    The body is read through ``response.iter_content`` in pieces of
    32768 bytes and written in binary mode; any existing file at
    *destination* is overwritten.

    Args:
        response: Object exposing ``iter_content(chunk_size)``.
        destination: Path of the file to write.
    """
    bytes_per_read = 32768

    with open(destination, "wb") as out_file:
        # Empty chunks (keep-alive chunks) carry no data and are skipped.
        out_file.writelines(
            piece for piece in response.iter_content(bytes_per_read) if piece
        )
# Fetch the sample pickle from Google Drive into the working directory.
download_file_from_google_drive('1-0R28Yhdrq2QWQ-4MXHIZUdZG2WZK2qR', 'sample.pkl')

# SECURITY NOTE: unpickling can execute arbitrary code. This file comes from
# the network, so only run this if you trust the source of the download.
sampleDF = pd.read_pickle('sample.pkl')

# Convert each entry of the 'totalCites2' column to a NumPy array.
# ``np.array`` is passed directly; a wrapping lambda adds nothing.
sampleDF['totalCites2'] = sampleDF['totalCites2'].apply(np.array)
Here is a Colab notebook with the same code, so you don't have to download anything onto your own system:
https://colab.research.google.com/drive/1kaaYk5_xbzQcXTr_DhjuWQT_3S4E-rML