
I need to know how, via Python, I can load CSV data from Cloud Storage into BigQuery without using the comma as the delimiter. Ideally the delimiter would be detected automatically, or I could declare the character that separates the fields of the table. Using the comma as the delimiter is a problem in my process.

After many attempts at using ";" as the delimiter, I noticed that my ingestion process does not recognize that character as a separator.
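For the automatic-detection part, I imagine something like the sketch below could guess the separator of the local file before the load (it only uses Python's standard csv.Sniffer; I have not wired it into the pipeline yet):

import csv

def detect_delimiter(path, candidates=";,|\t"):
    """Guess the field delimiter of a local CSV file with csv.Sniffer."""
    with open(path, newline='') as f:
        sample = f.read(4096)  # a few KB of the file is enough for sniffing
    return csv.Sniffer().sniff(sample, delimiters=candidates).delimiter

# e.g. detect_delimiter('test_google2.csv') should return ';'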

#Libraries
import csv 
import pyodbc
import logging
import os
import cloudstorage as gcs
import gcloud
from gcloud import storage
from google.cloud import bigquery
import pandas as pd

db = pyodbc.connect("DRIVER={SQL Server};server=10.0.1.1;database=blabla;uid=test;pwd=xxx")
cursor = db.cursor()
SQLview = 'select * from test'
cursor.execute(SQLview)
with open('test_google2.csv', 'w', newline= '') as f:
    writer = csv.writer(f, delimiter=',')
    writer.writerow([ i[0] for i in cursor.description ])
    writer.writerows(cursor.fetchall())    

from googleapiclient import discovery
from oauth2client.client import GoogleCredentials
import os
from google.cloud import bigquery
import json


try:
    # Assume the service-account key file (auth.json) sits next to this script
    script_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "auth.json")
except NameError:
    script_path = "C:\\Users\\user1\\auth.json"

# BigQuery credentials and settings: must point to the service-account key file
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = script_path

client = bigquery.Client(project='big-data')
dataset_id = 'dataset_tst'
dataset_ref = client.dataset('dataset_tst')    

credentials = GoogleCredentials.get_application_default()
service = discovery.build('storage', 'v1', credentials=credentials)

filename = 'C:\\Users\\user1\\Documents\\test_google2.csv'
bucket = 'big-data-bkt'

body = {'name': 'test_google2.csv'}
req = service.objects().insert(bucket=bucket, body=body, media_body=filename)
resp = req.execute()


#CLOUD STORAGE >>> BIG QUERY
from gcloud import storage
from google.cloud import bigquery
from gcloud import bigquery as bq1

bucket_uri = 'gs://bucket_name/'  # gs:// prefix of the target bucket
bucket_name = 'bucket_name'
bucket_target = 'test_google2.csv'
local_dataset = 'test_google2.csv'
bucket_target_uri = bucket_uri + bucket_target
bigquery_dataset = 'dataset_tst'
bigquery_table = 'test'

client1 = bq1.Client(project='big-data')
dataset_ref1 = client1.dataset(bigquery_dataset)
dataset_ref1.create()  # API request

def upload_blob(bucket_name, source_file_name, destination_blob_name):
    """Upload a CSV to Google Cloud Storage.

    1. Retrieve the target bucket.
    2. Set destination of data to be uploaded.
    3. Upload local CSV.
    """
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)
    # Commence Upload
    blob.upload_from_filename(source_file_name)
    print('File {} uploaded to {}.'.format(
        source_file_name,
        destination_blob_name))


def insert_bigquery(target_uri, dataset_id, table_id):
    """Insert CSV from Google Storage to BigQuery Table.

    1. Specify target dataset within BigQuery.
    2. Create a Job configuration.
    3. Specify that we are autodetecting datatypes.
    4. Reserve row #1 for headers.
    5. Specify the source format of the file (defaults to CSV).
    6. Pass in the URI of the data stored on Google Cloud Storage.
    7. Load BigQuery Job.
    8. Execute BigQuery Job.
    """
    bigquery_client = bigquery.Client()
    dataset_ref = bigquery_client.dataset(dataset_id)
    job_config = bigquery.LoadJobConfig()
    job_config.autodetect = True
    job_config.skip_leading_rows = 1
    job_config.source_format = bigquery.SourceFormat.CSV
    uri = target_uri
    load_job = bigquery_client.load_table_from_uri(uri,dataset_ref.table(table_id), job_config=job_config)  # API request
    print('Starting job {}'.format(load_job.job_id))
    # Waits for table load to complete.
    load_job.result()
    print('Job finished.')


upload_blob(bucket_name, local_dataset, bucket_target)
insert_bigquery(bucket_target_uri, bigquery_dataset, bigquery_table)

1 Answer


Alright, I took the time to reproduce this issue myself so I could give you a proper answer, and here's what I found.

If you want to use a delimiter other than the default "," when loading data into BigQuery, you need to specify it in your job_config:

job_config.field_delimiter = ";"

That's all. You can learn more about the options and the different delimiters you can use in the documentation.

My final code is below (since I'm using Cloud Shell, I skip a few credential configurations).

# Libraries
import csv 
import logging
import os
# import cloudstorage as gcs
import gcloud
from gcloud import storage
from google.cloud import bigquery
from googleapiclient import discovery
from oauth2client.client import GoogleCredentials
import json
import mysql.connector

# connecting to the DB 
cnx = mysql.connector.connect(user="user", password="pass", host="11.111.111.11", database="test")
cursor = cnx.cursor()
SQLview = 'select * from test'
filename = 'test_google2.csv'
folder = "folder_path_to_file"

# Creating CVS file
cursor.execute(SQLview)
with open(filename, 'w', newline= '') as f:
    writer = csv.writer(f, delimiter=';')
    writer.writerow([ i[0] for i in cursor.description ])
    writer.writerows(cursor.fetchall())    


# uploading it into a bucket
def upload_blob(bucket_name, source_file_name, destination_blob_name):
    storage_client = storage.Client(project="project_name")
    bucket = storage_client.get_bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)
    blob.upload_from_filename(source_file_name)
    print('File {} uploaded to {}'.format(
        source_file_name,
        destination_blob_name
    ))


# inserting the csv from Cloud Storage into BigQuery
def insert_bigquery(target_uri, dataset_id, table_id):
    bigquery_client = bigquery.Client(project="project_name")
    dataset_ref = bigquery_client.dataset(dataset_id)
    job_config = bigquery.LoadJobConfig()
    job_config.autodetect = True
    job_config.skip_leading_rows = 1
    job_config.source_format = bigquery.SourceFormat.CSV
    job_config.field_delimiter = ";"
    uri = target_uri
    load_job = bigquery_client.load_table_from_uri(
        uri,
        dataset_ref.table(table_id),
        job_config=job_config
        )
    print('Starting job {}'.format(load_job.job_id))
    load_job.result()
    print('Job finished.')



upload_blob("bucket_name", folder + filename, filename)
insert_bigquery("gs://bucket_name/"+filename, "dataset_id", "table_id")
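If you want to double-check the load afterwards, a small optional sanity check (just a sketch, reusing the same placeholder project/dataset/table names) is to read the row count of the destination table:

from google.cloud import bigquery

def check_row_count(project, dataset_id, table_id):
    """Print how many rows the destination table contains after the load job."""
    client = bigquery.Client(project=project)
    table = client.get_table(client.dataset(dataset_id).table(table_id))
    print('Table {}.{} now has {} rows.'.format(dataset_id, table_id, table.num_rows))

# check_row_count("project_name", "dataset_id", "table_id")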
  • Thanks Mayeru, that solved my problem! =) Out of curiosity: is it possible to create the CSV file directly in Cloud Storage, without saving it on the local machine first? – Felipe FB May 21 '19 at 13:06
  • If you are using Cloud SQL, you can export your data as a CSV file directly into your bucket on Cloud Storage [1]. https://cloud.google.com/sql/docs/mysql/import-export/exporting#csv – Mayeru May 21 '19 at 14:50
  • I am using SQL Server on a server outside the cloud and passing the data to Cloud Storage and BigQuery. In that case, do you see any alternatives? – Felipe FB May 21 '19 at 16:44
  • Transferring it directly without saving it on your local machine first? Sadly, not that I know of. There are other ways to export your data to CSV from SQL Server, though [1]. [1] https://stackoverflow.com/questions/14212641/export-table-from-database-to-csv-file – Mayeru May 22 '19 at 07:22
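Regarding the "without saving locally" question in the comments, one possible workaround (just a sketch, reusing the cursor from the answer, the same placeholder bucket/file names, and the google.cloud.storage client) is to build the CSV in memory and push it with Blob.upload_from_string, so nothing is written to disk:

import csv
import io
from google.cloud import storage

def upload_query_result_as_csv(cursor, bucket_name, destination_blob_name, delimiter=';'):
    """Write the cursor's result set to an in-memory CSV and upload it to Cloud Storage."""
    buffer = io.StringIO()
    writer = csv.writer(buffer, delimiter=delimiter)
    writer.writerow([col[0] for col in cursor.description])  # header row
    writer.writerows(cursor.fetchall())                      # data rows
    storage_client = storage.Client(project="project_name")
    bucket = storage_client.get_bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)
    blob.upload_from_string(buffer.getvalue(), content_type='text/csv')
    print('Uploaded {} to bucket {}'.format(destination_blob_name, bucket_name))

# cursor.execute(SQLview)
# upload_query_result_as_csv(cursor, "bucket_name", "test_google2.csv")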