0

I have file with data avito_trend.csv and I want to print barchart of

import pandas as pd
import itertools

df = pd.read_csv("avito_trend.csv", parse_dates=[2])

def f(df):
    dfs = []
    for x in [list(x) for x in itertools.combinations(df['address'].unique(), 2)]:    
        c1 = df.loc[df['address'].isin([x[0]]), 'ID']
        c2 = df.loc[df['address'].isin([x[1]]), 'ID']
        c = pd.Series(list(set(c1).intersection(set(c2))))
        dfs.append(pd.DataFrame({'a':len(c), 'b':' and '.join(x)}, index=[0]))
    return pd.concat(dfs)

result = df.groupby([df['used_at'].dt.year]).apply(f).reset_index(drop=True, level=1).reset_index()
print result


    used_at     a                                    b
0      2014  1364                   avito.ru and e1.ru
1      2014  1716                 avito.ru and drom.ru
2      2014  1602                 avito.ru and auto.ru
3      2014   299           avito.ru and avtomarket.ru
4      2014   579                   avito.ru and am.ru
5      2014   602             avito.ru and irr.ru/cars
6      2014   424       avito.ru and cars.mail.ru/sale
7      2014   634                    e1.ru and drom.ru
8      2014   475                    e1.ru and auto.ru
9      2014   139              e1.ru and avtomarket.ru
10     2014   224                      e1.ru and am.ru
11     2014   235                e1.ru and irr.ru/cars
12     2014   154          e1.ru and cars.mail.ru/sale
13     2014   874                  drom.ru and auto.ru
14     2014   247            drom.ru and avtomarket.ru
15     2014   394                    drom.ru and am.ru
16     2014   423              drom.ru and irr.ru/cars
17     2014   292        drom.ru and cars.mail.ru/sale
18     2014   243            auto.ru and avtomarket.ru
19     2014   408                    auto.ru and am.ru
20     2014   409              auto.ru and irr.ru/cars
21     2014   330        auto.ru and cars.mail.ru/sale
22     2014   133              avtomarket.ru and am.ru
23     2014   139        avtomarket.ru and irr.ru/cars
24     2014   105  avtomarket.ru and cars.mail.ru/sale
25     2014   223                am.ru and irr.ru/cars
26     2014   166          am.ru and cars.mail.ru/sale
27     2014   197    irr.ru/cars and cars.mail.ru/sale
28     2015  1153                   avito.ru and e1.ru
29     2015  1473                 avito.ru and auto.ru
30     2015  1491                 avito.ru and drom.ru
31     2015   403             avito.ru and irr.ru/cars
32     2015   205           avito.ru and avtomarket.ru
33     2015   256       avito.ru and cars.mail.ru/sale
34     2015   262                   avito.ru and am.ru
35     2015   451                    e1.ru and auto.ru
36     2015   539                    e1.ru and drom.ru
37     2015   148                e1.ru and irr.ru/cars
38     2015   105              e1.ru and avtomarket.ru
39     2015   105          e1.ru and cars.mail.ru/sale
40     2015    99                      e1.ru and am.ru
41     2015   799                  auto.ru and drom.ru
42     2015   288              auto.ru and irr.ru/cars
43     2015   162            auto.ru and avtomarket.ru
44     2015   195        auto.ru and cars.mail.ru/sale
45     2015   224                    auto.ru and am.ru
46     2015   277              drom.ru and irr.ru/cars
47     2015   175            drom.ru and avtomarket.ru
48     2015   189        drom.ru and cars.mail.ru/sale
49     2015   187                    drom.ru and am.ru
50     2015    73        irr.ru/cars and avtomarket.ru
51     2015    94    irr.ru/cars and cars.mail.ru/sale
52     2015   102                irr.ru/cars and am.ru
53     2015    48  avtomarket.ru and cars.mail.ru/sale
54     2015    72              avtomarket.ru and am.ru
55     2015    73          cars.mail.ru/sale and am.ru

I use ax = result.plot(width=0.5, kind='barh', stacked=True) and I want to get smth like this graph. How can I do it? I need that meaning to 2014 and 2015 years be in one strind for other pair of site. And instead percent I need mean of column a

NineWasps
  • 2,081
  • 8
  • 28
  • 45

2 Answers2

1

As @user308827 has already said, I would also use seaborn for that, but i would do it bit differently:

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option('display.expand_frame_repr', False)

cols = ['ID', 'address', 'used_at']
df = pd.read_csv(r'D:\data\gDrive\data\.stack.overflow\data\avito_trend.csv.gz',
                 parse_dates=['used_at'], usecols=cols)

df.sort_values(['ID','used_at','address'], inplace=True)

df['prev_address'] = df['address'].shift()
df['time_diff'] = df['used_at'] - df['used_at'].shift()

df = df[df['address'] != df['prev_address']]
df = df[df['time_diff'] <= pd.Timedelta('10min')]


tmp = df[['ID','address','prev_address']] \
      .groupby(['address','prev_address', df.used_at.dt.year]) \
      .count() \
      .reset_index()

# remove `df` from memory
del df

tmp['visit_from'] = tmp['prev_address'] + ' -> ' + tmp['address']

# keep only 'interesting' columns
tmp = tmp[['visit_from','used_at','ID']]
tmp.columns = ['visit_from','year','visits']

# save temporary groupped CSV file
#fn = r'D:\data\gDrive\data\.stack.overflow\data\avito_grp.csv'
#tmp.to_csv(fn, index=False)

# show all
#df = tmp

# show only those sites with visits >= 100 (within both years)
df = tmp[tmp.groupby('visit_from')['visits'].transform('sum') >= 100].reset_index()

# prepare sorted index
idx = df.groupby('visit_from')['visits'].transform('sum').sort_values(ascending=False).index

# 'apply' index
df = df.reindex(idx)

# add 'total' column (sum of visits for all years)
#df['total'] = df.groupby('visit_from')['visits'].transform('sum')

################################################
#
# SeaBorn plotting
#
sns.set(style="darkgrid")
sns.set_color_codes("pastel")

f, ax = plt.subplots(figsize=(16, 12))
ax = sns.barplot(x='visits', y='visit_from', hue='year', data=df, saturation=0.8)
plt.xlabel('Visits')

# add annotations
[ax.text(p.get_width() + 3, p.get_y() + p.get_height(),
         int(p.get_width()), fontsize=8)
 for p in ax.patches]


plt.show()

PS the interesting for you part begins with SeaBorn plotting comment

Z@eB@vsy@

MaxU - stand with Ukraine
  • 205,989
  • 36
  • 386
  • 419
  • Is it real to add means like that picture? [pic](https://silvrback.s3.amazonaws.com/uploads/ae516a51-06ae-400d-a93d-5c2ba3d3590f/delays_large.png) – NineWasps Mar 28 '16 at 11:17
  • And How can do space between every pair? Column to `el.ru -> avito.ru` to `2014` and `2015` and next add a space? and this for every pair – NineWasps Mar 28 '16 at 11:21
  • @ldevyataykina, do you want to add number of visits at the end of each bar or percentage? – MaxU - stand with Ukraine Mar 28 '16 at 11:29
  • There are not all pairs? – NineWasps Mar 28 '16 at 11:40
  • look for this comment in my answer: `show only those sites with visits >= 100 (within both years)` – MaxU - stand with Ukraine Mar 28 '16 at 11:51
  • Ok, Can you make the font of values slightly smaller? – NineWasps Mar 28 '16 at 11:53
  • I do not want you to strain , but can you help me with another problem? [link](http://stackoverflow.com/questions/36252018/different-sequence-of-names-with-pandas/36252402?noredirect=1#comment60151422_36252402) – NineWasps Mar 28 '16 at 12:26
  • Let us [continue this discussion in chat](http://chat.stackoverflow.com/rooms/107549/discussion-between-ldevyataykina-and-maxu). – NineWasps Mar 28 '16 at 13:11
  • Can you say, what packages I should have to setup `seaborn`? – NineWasps Mar 28 '16 at 20:46
  • @ldevyataykina, follow installation instructions: https://github.com/mwaskom/seaborn/blob/master/README.md – MaxU - stand with Ukraine Mar 28 '16 at 21:06
  • I write to you in our chat – NineWasps Mar 28 '16 at 21:35
  • I can't print years in order to bring. I get `26 2014 166 am.ru and cars.mail.ru/sale 25 2014 223 am.ru and irr.ru/cars 45 2015 224 auto.ru and am.ru 19 2014 408 auto.ru and am.ru` when I `all_common_users = df[['ID','address']].groupby([df['used_at'].dt.year]).apply(f).reset_index(drop=True, level=1).reset_index() common_users = all_common_users.groupby(all_common_users.index / 2).first().sort_values('pair of websites') ` All code in chat – NineWasps Mar 29 '16 at 12:38
  • and can you see on graph to some pair, for example: `el1 -> drom.ru` Why `87` more than `191`? There is an error – NineWasps Mar 29 '16 at 12:58
  • @ldevyataykina, i've fixed an error with wrong annotations – MaxU - stand with Ukraine Mar 29 '16 at 20:53
  • can you help me with one more question, please? – NineWasps Mar 30 '16 at 12:41
  • @ldevyataykina, it depends on the question ;) – MaxU - stand with Ukraine Mar 30 '16 at 14:29
  • I try to print some graph with this code, but I can't sort strings and print it at graph in sequence that I want – NineWasps Mar 30 '16 at 14:55
  • `def f(df): dfs = [] for x in [list(x) for x in itertools.combinations(df['address'].unique(), 2)]: c1 = df.loc[df['address'].isin([x[0]]), 'ID'] c2 = df.loc[df['address'].isin([x[1]]), 'ID'] c = pd.Series(list(set(c1).intersection(set(c2)))) #add inverted intersection c2 vs c1 c_invert = pd.Series(list(set(c2).intersection(set(c1)))) dfs.append(pd.DataFrame({'common users':len(c), 'pair of websites':' and '.join(x)}, index=[0]))` – NineWasps Mar 30 '16 at 14:56
  • continue `#swap values in x x[1],x[0] = x[0],x[1] dfs.append(pd.DataFrame({'common users':len(c_invert), 'pair of websites':' and '.join(x)}, index=[0])) return pd.concat(dfs) common_users = df.groupby([df['used_at'].dt.year]).apply(f).reset_index(drop=True, level=1).reset_index()` – NineWasps Mar 30 '16 at 14:56
  • I want to print graph like you do. But it prints awful – NineWasps Mar 30 '16 at 14:57
  • sorry, it's impossible to read - you should open another question – MaxU - stand with Ukraine Mar 30 '16 at 14:57
  • http://stackoverflow.com/questions/36112425/sort-strings-by-pandas-and-print-that-to-graph i add it there – NineWasps Mar 30 '16 at 15:10
0

Try seaborn

From https://stanford.edu/~mwaskom/software/seaborn/generated/seaborn.barplot.html

import seaborn as sns
sns.set_style("whitegrid")
tips = sns.load_dataset("tips")
ax = sns.barplot(x="day", y="total_bill", data=tips)

For stacked bar plot: https://gist.github.com/randyzwitch/b71d47e0d380a1a6bef9

user308827
  • 21,227
  • 87
  • 254
  • 417