I need to know how I can use Python to load CSV data from Cloud Storage into BigQuery without using the comma as the delimiter. Ideally the delimiter would be recognized automatically, or I could declare a variable that defines the field separator for the table. Using the comma as the delimiter is a problem in my process.
After many attempts with ";" as the delimiter, I noticed that my ingestion process does not recognize that character as a separator.
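What I am trying to express, roughly, is a load configuration where the separator is declared explicitly. A minimal sketch of what I have in mind (not tested end to end; I am assuming field_delimiter is the LoadJobConfig property that controls the separator):

from google.cloud import bigquery

# Sketch: declare the separator on the load job instead of relying on the default ','
job_config = bigquery.LoadJobConfig()
job_config.source_format = bigquery.SourceFormat.CSV
job_config.autodetect = True          # infer the schema
job_config.skip_leading_rows = 1      # first row holds the headers
job_config.field_delimiter = ';'      # declare the field separator explicitly

My full pipeline (SQL Server >>> local CSV >>> Cloud Storage >>> BigQuery) is below.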
# Libraries
import csv
import pyodbc

# Extract the source view from SQL Server and write it to a local CSV file
db = pyodbc.connect("DRIVER={SQL Server};server=10.0.1.1;database=blabla;uid=test;pwd=xxx")
cursor = db.cursor()
SQLview = 'select * from test'
cursor.execute(SQLview)

with open('test_google2.csv', 'w', newline='') as f:
    writer = csv.writer(f, delimiter=',')
    writer.writerow([i[0] for i in cursor.description])  # header row from cursor metadata
    writer.writerows(cursor.fetchall())
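If the separator is going to be ";" end to end, I assume the export step has to write the file with the same character, something along these lines (a sketch that reuses the cursor from above; only the delimiter argument changes):

# Sketch: write the extract with ';' so the file and the load job agree on the separator
with open('test_google2.csv', 'w', newline='') as f:
    writer = csv.writer(f, delimiter=';')
    writer.writerow([i[0] for i in cursor.description])
    writer.writerows(cursor.fetchall())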
from googleapiclient import discovery
from oauth2client.client import GoogleCredentials
import os
from google.cloud import bigquery

# BigQuery credentials and settings: point GOOGLE_APPLICATION_CREDENTIALS at the service account key
try:
    script_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "auth.json")
except NameError:  # __file__ is not defined in interactive sessions
    script_path = "C:\\Users\\user1\\auth.json"
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = script_path
client = bigquery.Client(project='big-data')
dataset_id = 'dataset_tst'
dataset_ref = client.dataset(dataset_id)
credentials = GoogleCredentials.get_application_default()
service = discovery.build('storage', 'v1', credentials=credentials)
filename = 'C:\\Users\\user1\\Documents\\test_google2.csv'
bucket = 'big-data-bkt'
body = {'name': 'test_google2.csv'}
req = service.objects().insert(bucket=bucket, body=body, media_body=filename)
resp = req.execute()
# CLOUD STORAGE >>> BIG QUERY
from google.cloud import storage
from google.cloud import bigquery

bucket_uri = 'gs://big-data-bkt/'       # gs:// URI of the bucket
bucket_name = 'big-data-bkt'
bucket_target = 'test_google2.csv'      # destination blob name in the bucket
local_dataset = 'test_google2.csv'      # local file to upload
bucket_target_uri = bucket_uri + bucket_target
bigquery_dataset = 'dataset_tst'
bigquery_table = 'test'

client1 = bigquery.Client(project='big-data')
dataset_ref1 = client1.dataset(bigquery_dataset)
client1.create_dataset(bigquery.Dataset(dataset_ref1))  # API request
def upload_blob(bucket_name, source_file_name, destination_blob_name):
    """Upload a CSV to Google Cloud Storage.

    1. Retrieve the target bucket.
    2. Set the destination of the data to be uploaded.
    3. Upload the local CSV.
    """
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)

    # Commence upload
    blob.upload_from_filename(source_file_name)
    print('File {} uploaded to {}.'.format(source_file_name, destination_blob_name))
def insert_bigquery(target_uri, dataset_id, table_id):
    """Insert a CSV from Google Cloud Storage into a BigQuery table.

    1. Specify the target dataset within BigQuery.
    2. Create a job configuration.
    3. Specify that we are autodetecting datatypes.
    4. Reserve row #1 for headers.
    5. Specify the source format of the file (defaults to CSV).
    6. Pass the URI of the data stored on Google Cloud Storage.
    7. Build the BigQuery load job.
    8. Execute the BigQuery load job.
    """
    bigquery_client = bigquery.Client()
    dataset_ref = bigquery_client.dataset(dataset_id)
    job_config = bigquery.LoadJobConfig()
    job_config.autodetect = True
    job_config.skip_leading_rows = 1
    job_config.source_format = bigquery.SourceFormat.CSV
    uri = target_uri
    load_job = bigquery_client.load_table_from_uri(
        uri, dataset_ref.table(table_id), job_config=job_config)  # API request
    print('Starting job {}'.format(load_job.job_id))

    # Wait for the table load to complete.
    load_job.result()
    print('Job finished.')
upload_blob(bucket_name, local_dataset, bucket_target)
insert_bigquery(bucket_target_uri, bigquery_dataset, bigquery_table)
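What I would ideally like is to declare the separator as a variable that the load job uses. Below is a sketch of the change I have in mind (the function name is mine and it is untested; it assumes field_delimiter on LoadJobConfig is the setting that controls the separator):

def insert_bigquery_with_delimiter(target_uri, dataset_id, table_id, delimiter=';'):
    """Same as insert_bigquery, but the field separator is declared as a variable (sketch)."""
    bigquery_client = bigquery.Client()
    dataset_ref = bigquery_client.dataset(dataset_id)
    job_config = bigquery.LoadJobConfig()
    job_config.autodetect = True
    job_config.skip_leading_rows = 1
    job_config.source_format = bigquery.SourceFormat.CSV
    job_config.field_delimiter = delimiter  # e.g. ';' instead of the default ','
    load_job = bigquery_client.load_table_from_uri(
        target_uri, dataset_ref.table(table_id), job_config=job_config)
    print('Starting job {}'.format(load_job.job_id))
    load_job.result()  # wait for the load to complete
    print('Job finished.')

insert_bigquery_with_delimiter(bucket_target_uri, bigquery_dataset, bigquery_table, ';')

Is field_delimiter the right way to declare the separator, and is there any way for BigQuery to recognize the delimiter automatically?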