I am writing a python function that runs a query against BigQuery tables and writes the result out to another table, here is my function with a small query:
def generate_user_pit(days_ago):
### Generate a user facts table for x days ago and save it into BigQuery
days_ago = days_ago
query = """
SELECT
Date,
sum(totals.visits)
FROM
`project.dataset.ga_sessions_20*` AS t
WHERE
parse_DATE('%y%m%d',
_table_suffix) BETWEEN DATE_SUB(CURRENT_DATE(), INTERVAL @days_ago day)
AND DATE_SUB(CURRENT_DATE(), INTERVAL @days_ago day)
GROUP BY Date
"""
query_params = [
bigquery.ScalarQueryParameter('days_ago', 'INT64', days_ago),
]
job_config = bigquery.QueryJobConfig()
job_config.query_parameters = query_params
job_config.write_disposition = 'WRITE_TRUNCATE'
print("Generating dataframe...")
df = client.query(query, job_config=job_config).to_dataframe()
date = df['Date'].max()
table_name = "data_"
complete_table_name = table_name + date
dataset_ref = client.dataset('test')
table_ref = dataset_ref.table(complete_table_name)
print("Writing", complete_table_name, "to BigQuery...")
client.load_table_from_dataframe(df, table_ref).result()
print("Done!")
This works as expected, but when I run it with my actual query, which is much more memory intensive, I am hitting Memory Error
.
Is there anything I can do to improve this function and make it more efficient? For example, I don't need the query written to a dataframe, this is just the way I've been able to get it to function now.
Here is my query that is returning that is maxing out memory:
WITH Q1 AS(
SELECT
customDimension.value AS UserID,
# Visits from this individual
COUNT(DISTINCT CONCAT(CAST(fullVisitorId AS STRING),CAST(visitId AS STRING))) AS visits,
# Orders from this individual
COUNT(DISTINCT hits.transaction.transactionId) AS orders,
# COUNT(DISTINCT hits.transaction.transactionId)/COUNT(DISTINCT VisitId) AS frequency,
# Conversion rate of the individual
SAFE_DIVIDE(COUNT(DISTINCT hits.transaction.transactionId),
COUNT(DISTINCT CONCAT(CAST(fullVisitorId AS STRING),CAST(visitId AS STRING)))) AS conversion_rate,
# first visit date
MIN(Date) AS first_visit_date,
# last visit date
MAX(DATE) AS last_visit_date
FROM
`project.dataset.ga_sessions_20*` AS t
CROSS JOIN
UNNEST (hits) AS hits
CROSS JOIN
UNNEST(t.customdimensions) AS customDimension
CROSS JOIN
UNNEST(hits.product) AS hits_product
WHERE
parse_DATE('%y%m%d',
_table_suffix) BETWEEN DATE_SUB(CURRENT_DATE(), INTERVAL 999 day)
AND DATE_SUB(CURRENT_DATE(), INTERVAL @days_ago day)
AND customDimension.index = 2
AND customDimension.value NOT LIKE "true"
AND customDimension.value NOT LIKE "false"
AND customDimension.value NOT LIKE "undefined"
AND customDimension.value IS NOT NULL
GROUP BY
UserID
),
Q2 AS(
SELECT
customDimension.value AS UserID,
IFNULL(SUM(totals.bounces),0) AS bounces,
IFNULL(SUM(totals.transactionRevenue)/1000000,0) AS revenue
FROM
`project.dataset.ga_sessions_20*` AS t
CROSS JOIN
UNNEST(t.customdimensions) AS customDimension
WHERE parse_date('%y%m%d', _table_suffix) between
DATE_sub(current_date(), interval 999 day) and
DATE_sub(current_date(), interval @days_ago day)
AND customDimension.index = 2
AND customDimension.value NOT LIKE "true"
AND customDimension.value NOT LIKE "false"
AND customDimension.value NOT LIKE "undefined"
AND customDimension.value IS NOT NULL
GROUP BY
UserID
),
Q3 AS(
SELECT customDimension.value AS UserID, MAX(Date) AS last_order
FROM `project.dataset.ga_sessions_*` AS t
CROSS JOIN UNNEST(t.customdimensions) AS customDimension
WHERE totals.transactions > 0
AND customDimension.index = 2
GROUP BY UserID
),
Q4 AS(
SELECT customDimension.value AS UserID, MIN(Date) AS first_order
FROM `project.dataset.ga_sessions_*` AS t
CROSS JOIN UNNEST(t.customdimensions) AS customDimension
WHERE totals.transactions > 0
AND customDimension.index = 2
GROUP BY UserID
)
SELECT a.UserID AS UserID,
SUM(visits) AS visits,
IFNULL(SUM(orders),0) AS orders,
IFNULL(MIN(conversion_rate),0) AS conversion_rate,
IFNULL(SUM(bounces),0) AS bounces,
IFNULL(SUM(revenue),0) AS revenue,
IFNULL(SAFE_DIVIDE(SUM(revenue),SUM(orders)),0) AS AOV,
IFNULL(SAFE_DIVIDE(SUM(revenue),SUM(visits)),0) AS rev_per_visit,
IFNULL(SAFE_DIVIDE(SUM(bounces),SUM(visits)),0) AS bounce_rat
FROM Q1 AS a
LEFT JOIN Q2 AS b
USING (UserID)
LEFT JOIN Q4 AS f
USING (UserID)
LEFT JOIN Q3 AS l
USING (UserID)
GROUP BY UserID