My work sample
- Data cleaning and normalizing
%%sql SELECT DISTINCT breed, TRIM(LEADING '-' FROM breed) AS breed_fixed FROM dogs ORDER BY breed_fixed
- DISTINCT, and ORDER BY in your own queries
%%sql select user_guid, membership_type, state, country, created_at from users where membership_type = 2 and state = "NC" and country = "us" and created_at > '2014-3-1' order by created_at desc;
- Summarizing Data
%%sql select gender, breed_group, count(distinct dog_guid) as num_dogs from dogs where breed_group !="" and breed_group is not null and breed_group !='none' group by 1, 2 having num_dogs >=1000 order by 3 desc
%%sql select test_name, avg(timestampdiff(hour, start_time, end_time)) as duration from exam_answers where timestampdiff(hour, start_time, end_time)<= 6000 group by 1 having avg(timestampdiff(second, start_time, end_time))>0 order by duration desc
%%sql select state, zip, count(distinct user_guid) as num_uguid from users where country ='us' group by state, zip having num_uguid>5 order by state asc, num_uguid desc
%%sql select test_name, step_type, start_time, end_time, timestampdiff(minute, start_time, end_time) as duration from exam_answers where timestampdiff(minute, start_time, end_time)<0 order by timestampdiff(minute, start_time, end_time) asc
- Subqueries_and_Derived_Tables
%%sql select count(distinct u.user_guid) from users u where not exists (select * from dogs d where u.user_guid = d.user_guid and d.user_guid is not null)
%%sql SELECT du.user_guid AS uUserID, d.user_guid AS dUserID FROM (select distinct user_guid from users) du LEFT JOIN dogs d ON du.user_guid=d.user_guid where du.user_guid is not null
%%sql SELECT DistinctUUsersID.user_guid AS uUserID, ddUserID.user_guid AS dUserID, count(*) AS numrows FROM (SELECT DISTINCT u.user_guid FROM users u limit 100 ) AS DistinctUUsersID LEFT JOIN (select distinct d.user_guid from dogs d ) as ddUserID ON DistinctUUsersID.user_guid=ddUserID.user_guid
GROUP BY DistinctUUsersID.user_guid ORDER BY numrows DESC;
%%sql
select g.dim, avg(g.nTest) as avg_test
from(select d.dog_guid as DogID, d.dimension as dim, count(c.test_name) as nTest
from dogs d join complete_tests c
on d.dog_guid = c.dog_guid
group by d.dog_guid, d.dimension) as g
group by g.dim
- Useful_Logical_Functions
%%sql
SELECT CASE cleaned_users.country
WHEN "US" THEN "In US"
WHEN "N/A" THEN "Not Applicable"
ELSE "Outside US"
END AS US_user,
count(cleaned_users.user_guid)
FROM (SELECT DISTINCT user_guid, country
FROM users
WHERE country IS NOT NULL) AS cleaned_users
GROUP BY US_user