import pandas as pd
import matplotlib.pylab as plt
import numpy as np
import matplotlib.cm as cm
import seaborn
seaborn.set()
%matplotlib inline
# Load the paper, author, institution and country tables from the JSON
# files that sit next to this notebook.
# The row labelled 0 of the paper table is dropped right after loading
# (original note: it "has nan").
df_paper = pd.read_json('./paper_df.json').drop(index=[0])
df_author = pd.read_json('./author_df.json')
df_inst = pd.read_json('./inst_df.json')
df_country = pd.read_json('./country_df.json')
# All data are available at https://github.com/pajotarthur/ICLR_data
# Average review score per decision value.
# The `review` column is selected before aggregating so the average is
# computed only for that column rather than for every column of the frame.
groupby_decision = df_paper.groupby('decision')
groupby_decision['review'].aggregate(np.average)

# Count papers per (rounded review score, decision) pair.
# `dummy` is a column of ones, so summing it within each group yields the
# group size; unstacking turns the decisions into columns.
df = pd.DataFrame({'value': df_paper.review.round(1), 'type': df_paper.decision})
df['dummy'] = 1
ag = df.groupby(['value', 'type']).sum().unstack()
# Drop the outer ('dummy') column level so columns are just the decisions.
ag.columns = ag.columns.droplevel()

# `ag` must exist before either plot: the stacked plot previously ran
# before `ag` was assigned, which fails when run top to bottom.
# Stacked bars: one bar per rounded score, split by decision.
ag.plot(kind='bar', colormap=cm.Accent, width=1, figsize=(15, 10), stacked=True)
plt.show()

# Same counts with the decisions drawn side by side instead of stacked.
ag.plot(kind='bar', colormap=cm.Accent, width=1, figsize=(15, 10))
plt.show()
# First 25 authors ordered by `nb_poster`, then `nb_oral`, then
# `nb_workshop`, all descending.
df_author.sort_values(
    by=['nb_poster', 'nb_oral', 'nb_workshop'],
    ascending=False,
).head(n=25)

# First 10 authors ordered by `nb_paper`, descending.
df_author.sort_values(by=['nb_paper'], ascending=False).head(n=10)

# Among authors with `nb_paper` above 3, the first 10 by descending
# `avg_note`.
df_author.loc[df_author['nb_paper'] > 3].sort_values(
    by=['avg_note'],
    ascending=False,
).head(n=10)
# An institution is credited with one full paper when all of the paper's
# authors are from that institution. Otherwise the credit is fractional: if
# only 1 of a paper's 3 authors is from the institution, it is credited with
# 1/3 of a paper. Institutions were scraped from the authors' email
# addresses. There is probably more work to do; the code is available at
# https://github.com/pajotarthur/ICLR_data/blob/master/add_paper_origin.ipynb.
# Emails ending with gmail.com have been removed.
# First 10 institutions ordered by `nb_poster`, then `nb_oral`, then
# `nb_workshop`, all descending.
df_inst.sort_values(
    by=['nb_poster', 'nb_oral', 'nb_workshop'],
    ascending=False,
).head(n=10)

# First 10 institutions ordered by `nb_paper`, descending.
df_inst.sort_values(by=['nb_paper'], ascending=False).head(n=10)

# Among institutions with `nb_paper` above 3, the first 10 by descending
# `acceptance_rate`.
df_inst.loc[df_inst['nb_paper'] > 3].sort_values(
    by=['acceptance_rate'],
    ascending=False,
).head(n=10)
# Bar charts for institutions whose `nb_poster` is above 1.
# The filter is computed once and shared by all four charts.
inst_with_posters = df_inst[df_inst.nb_poster > 1]

# Stacked bars of the four per-outcome counts, ordered by `nb_poster`.
inst_with_posters[['nb_poster', 'nb_reject', 'nb_oral', 'nb_workshop']].sort_values(
    by='nb_poster'
).plot.bar(figsize=(15, 10), legend=True, fontsize=15, stacked=True)
plt.show()

# One chart per column, each sorted ascending by its own values.
# plt.show() after every chart matches the earlier plots in this file and
# keeps a Series plot from being drawn onto the previous chart's axes when
# the statements run back to back.
for column in ('nb_poster', 'acceptance_by_author', 'acceptance_rate'):
    inst_with_posters[column].sort_values().plot.bar(
        figsize=(15, 10), legend=False, fontsize=20
    )
    plt.show()
# First 10 countries ordered by `nb_poster`, then `nb_oral`, then
# `nb_workshop`, all descending.
df_country.sort_values(
    by=['nb_poster', 'nb_oral', 'nb_workshop'],
    ascending=False,
).head(n=10)

# First 10 countries ordered by `nb_paper`, descending.
df_country.sort_values(by=['nb_paper'], ascending=False).head(n=10)

# Among countries with `nb_paper` above 5, the first 10 by descending
# `acceptance_rate`.
df_country.loc[df_country['nb_paper'] > 5].sort_values(
    by=['acceptance_rate'],
    ascending=False,
).head(n=10)
# Stacked bars of the four per-outcome counts for countries whose
# `nb_poster` is above 1, ordered by `nb_poster`.
df_country[df_country.nb_poster > 1][
    ['nb_poster', 'nb_reject', 'nb_oral', 'nb_workshop']
].sort_values(by='nb_poster').plot.bar(
    figsize=(15, 10), legend=True, fontsize=15, stacked=True
)
# plt.show() after each chart matches the earlier plots in this file and
# keeps the Series plot below from being drawn onto this chart's axes when
# the statements run back to back.
plt.show()

# `nb_paper` per country, for countries whose `nb_paper` is above 1,
# sorted ascending.
df_country[df_country.nb_paper > 1].nb_paper.sort_values().plot.bar(
    figsize=(15, 10), legend=False, fontsize=25
)
plt.show()