I'm writing a UDF to process Google Analytics data, and getting the "UDF out of memory" error message when I try to process multiple rows. I downloaded the raw data and found the largest record and tried running my UDF query on that, with success. Some of the rows have up to 500 nested hits, and the size of the hit record (by far the largest component of each row of the raw GA data) does seem to have an effect on how many rows I can process before getting the error.
For example, the query
select
user.ga_user_id,
ga_session_id,
...
from
temp_ga_processing(
select
fullVisitorId,
visitNumber,
...
from [79689075.ga_sessions_20160201] limit 100)
returns the error, but
from [79689075.ga_sessions_20160201] where totals.hits = 500 limit 1)
does not.
I was under the impression that any memory limitations were per-row? I've tried several techniques, such as setting row = null;
before emit(return_dict);
(where return_dict is the processed data) but to no avail.
The UDF itself doesn't do anything fancy; I'd paste it here but it's ~45 kB in length. It essentially does a bunch of things along the lines of:
function temp_ga_processing(row, emit) {
topic_id = -1;
hit_numbers = [];
first_page_load_hits = [];
return_dict = {};
return_dict["user"] = {};
return_dict["user"]["ga_user_id"] = row.fullVisitorId;
return_dict["ga_session_id"] = row.fullVisitorId.concat("-".concat(row.visitNumber));
for(i=0;i<row.hits.length;i++) {
hit_dict = {};
hit_dict["page"] = {};
hit_dict["time"] = row.hits[i].time;
hit_dict["type"] = row.hits[i].type;
hit_dict["page"]["engaged_10s"] = false;
hit_dict["page"]["engaged_30s"] = false;
hit_dict["page"]["engaged_60s"] = false;
add_hit = true;
for(j=0;j<row.hits[i].customMetrics.length;j++) {
if(row.hits[i].customDimensions[j] != null) {
if(row.hits[i].customMetrics[j]["index"] == 3) {
metrics = {"video_play_time": row.hits[i].customMetrics[j]["value"]};
hit_dict["metrics"] = metrics;
metrics = null;
row.hits[i].customDimensions[j] = null;
}
}
}
hit_dict["topic"] = {};
hit_dict["doctor"] = {};
hit_dict["doctor_location"] = {};
hit_dict["content"] = {};
if(row.hits[i].customDimensions != null) {
for(j=0;j<row.hits[i].customDimensions.length;j++) {
if(row.hits[i].customDimensions[j] != null) {
if(row.hits[i].customDimensions[j]["index"] == 1) {
hit_dict["topic"] = {"name": row.hits[i].customDimensions[j]["value"]};
row.hits[i].customDimensions[j] = null;
continue;
}
if(row.hits[i].customDimensions[j]["index"] == 3) {
if(row.hits[i].customDimensions[j]["value"].search("doctor") > -1) {
return_dict["logged_in_as_doctor"] = true;
}
}
// and so on...
}
}
}
if(row.hits[i]["eventInfo"]["eventCategory"] == "page load time" && row.hits[i]["eventInfo"]["eventLabel"].search("OUTLIER") == -1) {
elre = /(?:onLoad|pl|page):(\d+)/.exec(row.hits[i]["eventInfo"]["eventLabel"]);
if(elre != null) {
if(parseInt(elre[0].split(":")[1]) <= 60000) {
first_page_load_hits.push(parseFloat(row.hits[i].hitNumber));
if(hit_dict["page"]["page_load"] == null) {
hit_dict["page"]["page_load"] = {};
}
hit_dict["page"]["page_load"]["sample"] = 1;
page_load_time_re = /(?:onLoad|pl|page):(\d+)/.exec(row.hits[i]["eventInfo"]["eventLabel"]);
if(page_load_time_re != null) {
hit_dict["page"]["page_load"]["page_load_time"] = parseFloat(page_load_time_re[0].split(':')[1])/1000;
}
}
// and so on...
}
}
row = null;
emit return_dict;
}
The job ID is realself-main:bquijob_4c30bd3d_152fbfcd7fd