Study of the temporal evolution of a corpus through word embeddings

Author(s):
License:

GNU-GPLv3+

Project:

https://en.wikiversity.org/wiki/The_dynamics_and_social_organization_of_innovation_in_the_field_of_oncology

Reference repository for this file:

https://gitlab.com/solstag/oncologister-pvt

Contributions are welcome; get in touch with the author(s).

In [1]:
# This notebook requires pandas, gensim, lxml and matplotlib
import re, multiprocessing, numpy, json, os, sys, math
import pandas, gensim
from collections import defaultdict, namedtuple, OrderedDict
from copy import deepcopy
import matplotlib.pyplot as plt
from lxml import html
%matplotlib inline
In [2]:
# Options used to run the notebook
opt_your_initials = 'aha' # SET THIS BEFORE RUNNING, i.e. 'aha', 'jpc', ...
opt_data_source = 'getpapersdir'
opt_training_mode = 'randomfill'
opt_model_iter = 5
opt_name_extra = ''
opt_model_name = '-'.join( [opt_your_initials, opt_data_source, opt_training_mode, str(opt_model_iter)] )
opt_model_name += '-' + opt_name_extra if opt_name_extra else ''
opt_load_models = True
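With the options above, the derived model name is 'aha-getpapersdir-randomfill-5' (opt_name_extra is empty, so no extra suffix is appended), and the per-year model files saved below are named like 'w2v-aha-getpapersdir-randomfill-5-1992'. A one-line check, should you want to confirm it:

print(opt_model_name)  # -> 'aha-getpapersdir-randomfill-5' with the defaults above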
In [3]:
# Source abstracts from JSON files generated by esmo.py
def get_esmo(ayears, esmo_path):
    # get abstracts converted from PDF files (JSON files under each year's 'xml' directory)
    for year in os.listdir(esmo_path):
        year_path = os.path.join(esmo_path,year,'xml')
        for sec in os.listdir(year_path):
            if sec.endswith('.json'):
                sec_path = os.path.join(year_path,sec)
                with open(sec_path) as sec_file:
                    content = json.load(sec_file)
                ayears[int(year)].abstracts.extend( [ l['text'] for l in item['abstract'] ]
                                                    for item in content if item['separator'] is not None )
    # get abstracts converted from HTML files (JSON files under each year's 'content' directory)
    for year in os.listdir(esmo_path):
        year_path = os.path.join(esmo_path,year,'content')
        for sec in os.listdir(year_path):
            if sec.endswith('.json'):
                sec_path = os.path.join(year_path,sec)
                with open(sec_path) as sec_file:
                    content = json.load(sec_file)
                for a in content['abstracts']:
                    htext = html.fromstring(a['text'])
                    htext.find('h2').getparent().remove(htext.find('h2'))
                    abstract = [ a['title'],
                                 ' '.join(a['authors'].keys()),
                                 ' '.join(sum(a['authors'].values(),[])),
                                 htext.text_content() ]
                    ayears[int(year)].abstracts.append( abstract )

# Source abstracts from JSON files generated by ContentMine's getpapers.js
def get_getpapers(ayears, getpapers_path):
    names = [n for n in os.listdir(getpapers_path) if os.path.isdir(os.path.join(getpapers_path, n))]
    noabstract = []
    for name in names:
        with open(os.path.join(getpapers_path, name, 'eupmc_result.json')) as f:
            content = json.load(f)
        try:
            year = int(content["journalInfo"][0]["yearOfPublication"][0])
            abstract = content["abstractText"][0]
            pmid = content["id"][0]
            citation = int(content["citedByCount"][0])            
            if any( map( lambda x: len(x)!=1,
                         (content["abstractText"], content["id"], content["citedByCount"],
                          content["journalInfo"][0]["yearOfPublication"]) ) ):
                raise Exception('Non-unique item!')
            if not len(abstract):
                raise Exception('Empty abstract!')
            ayears[year].abstracts.append([abstract])
            ayears[year].pmids.append(pmid)
            ayears[year].citations.append(citation)
        except KeyError:
            noabstract.append(name)
    if noabstract:
        print('In <'+getpapers_path+'> missing "abstractText":', ', '.join(noabstract))
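To make the expected input explicit, here is a minimal, purely illustrative sketch of the fields of eupmc_result.json that get_getpapers() reads. The values are made up, not real EuPMC output; every field is a one-element list, which is what the 'Non-unique item!' check above enforces.

# Illustration only: a synthetic eupmc_result.json record with the fields used above
example_eupmc_result = {
    "id": ["12345678"],                                # hypothetical PMID
    "journalInfo": [{"yearOfPublication": ["1992"]}],  # publication year
    "abstractText": ["Example abstract text ..."],     # the abstract itself
    "citedByCount": ["9"],                             # citation count
}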
In [4]:
# Create the main data structure
ArticleSeries = namedtuple('Articles', ['abstracts', 'pmids', 'citations'])
ayears = defaultdict(lambda: ArticleSeries([], [], []))

# ESMO abstracts from esmo.py
if opt_data_source=='esmo':
    esmo_path='../data/esmo/auto_esmo'
    get_esmo(ayears, esmo_path)

# Getpapers single dir
if opt_data_source=='getpaperssingle':
    getpapers_path = '/home/aedesaegypti/contentmine/papers/diarrhea_angola_orig/'
    get_getpapers(ayears, getpapers_path)

# Getpapers dir store
if opt_data_source=='getpapersdir':

    getpapers_store = '../data/gotpapers/'
    getpapers_re    = re.compile('gp-mesh_Breast_Neoplasms-research_article-en-')
    getpapers_paths = [os.path.join(getpapers_store, name)
                         for name in os.listdir(getpapers_store)
                           if os.path.isdir(os.path.join(getpapers_store, name)) \
                             and getpapers_re.match(name)]
    # Tell us where we'll look for papers
    print('## Abstract paths:')
    for path in getpapers_paths:
        print(path)
    print()

    print('## Defective abstracts:')
    for getpapers_path in getpapers_paths:
        get_getpapers(ayears, getpapers_path)
    print()

# Transform into OrderedDict of DataFrames
ayears = OrderedDict( sorted( ayears.items() ) )
for year in ayears:
    ayears[year] = pandas.DataFrame( ayears[year]._asdict() )

# Tell us what we've got
print('## Abstract count for {}:'.format(opt_data_source))
for year in ayears:
    print(year, len(ayears[year]))
print('Total:', sum( map( len, ayears.values() ) ), '\n')
## Abstract paths:
../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-1991
../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-1992
../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-1993
../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-1994
../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-1995
../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-1996
../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-1997
../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-1998
../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-1999
../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2000
../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2001
../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2002
../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2003
../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2004
../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2005
../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2006
../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2007
../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2008
../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2009
../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2010
../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2011
../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2012
../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2013
../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2014
../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2015
../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2016

## Defective abstracts:
In <../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-1997> missing "abstractText": PMC3950102, PMC3950105
In <../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-1998> missing "abstractText": PMC3950056
In <../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2001> missing "abstractText": PMC4809519
In <../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2002> missing "abstractText": PMC4052987
In <../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2003> missing "abstractText": PMC3157308, PMC140764, PMC2933200
In <../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2004> missing "abstractText": PMC2750770
In <../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2007> missing "abstractText": PMC2409269, PMC1963418
In <../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2008> missing "abstractText": PMC2605101, PMC2921443, PMC2577595, PMC2662631, PMC2910511, PMC2613431, PMC2637546, PMC2692037, PMC2597463, PMC2613188, PMC2742969, PMC2397515, PMC2613432, PMC2613189, PMC2613186, PMC2701509, PMC2646902
In <../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2009> missing "abstractText": PMC2797682, PMC2827040, PMC3462220, PMC2998989, PMC4059389, PMC3050000, PMC4276027, PMC4828958, PMC4119170, PMC3085997, PMC2884369, PMC2721826, PMC3086398, PMC2792756, PMC2710299, PMC2927217, PMC2701308, PMC2940713, PMC2702716, PMC2762202, PMC2805406, PMC3432638
In <../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2010> missing "abstractText": PMC3046437, PMC2854631, PMC3086479, PMC3727643, PMC2972547, PMC3532890, PMC4633045, PMC3086476, PMC3005737, PMC2831800, PMC3011974, PMC3891888, PMC2802109, PMC3069698, PMC2980856, PMC3910096
In <../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2011> missing "abstractText": PMC4291023, PMC3070703, PMC3268011, PMC3154075, PMC3109560, PMC3173545, PMC3288213, PMC4438279, PMC3109583, PMC3533368, PMC2944904, PMC3109561, PMC3139707, PMC3181002, PMC3085082, PMC2993828, PMC3063418, PMC3285267, PMC3549592
In <../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2012> missing "abstractText": PMC4105700, PMC4086916, PMC3459983, PMC3926433, PMC3618958, PMC4005330, PMC3384020, PMC4529025, PMC4816219, PMC3760188, PMC3285417, PMC4084672
In <../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2013> missing "abstractText": PMC3919429, PMC4116686, PMC4010071, PMC4495903, PMC3960406, PMC3965749, PMC3728631, PMC3745823, PMC4084801, PMC4174342, PMC3782420, PMC3631073, PMC3628101, PMC3639538, PMC3978629, PMC3639537, PMC3639541, PMC3825305, PMC3755937, PMC3962824, PMC3845670, PMC3662856
In <../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2014> missing "abstractText": PMC4423750, PMC4221801, PMC4422409, PMC4242865, PMC4176456, PMC4159115, PMC4282611, PMC4303193, PMC4303197, PMC4132248, PMC4132219, PMC4467453, PMC4076619, PMC4272910, PMC4828656, PMC4704099, PMC4169670, PMC3926795, PMC4137837, PMC4137836, PMC4147833, PMC3976704, PMC4303350, PMC4254381, PMC4167984, PMC3908999, PMC4560354, PMC4271782, PMC3887419, PMC4011400, PMC4081638, PMC4053154, PMC4328123, PMC4235676, PMC4052563, PMC3926784, PMC4243053, PMC4677672, PMC4053237, PMC3926792
In <../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2015> missing "abstractText": PMC4691585, PMC4691584, PMC4691586, PMC4842308, PMC4510219, PMC4569197, PMC4569215, PMC4564307, PMC4477195, PMC4511377, PMC4822528, PMC4412310, PMC4589306, PMC4708077, PMC4503333, PMC4521403, PMC4309631, PMC4610139, PMC4507296, PMC4347900, PMC4603827
In <../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2016> missing "abstractText": PMC4920648

## Abstract count for getpapersdir:
1991 153
1992 152
1993 191
1994 189
1995 219
1996 262
1997 289
1998 305
1999 304
2000 282
2001 339
2002 281
2003 347
2004 421
2005 480
2006 552
2007 812
2008 1499
2009 1951
2010 2349
2011 2704
2012 3321
2013 3785
2014 4324
2015 3976
2016 1090
Total: 30577 

In [5]:
# Split words and do some cleaning
def linestowords(document):
    splitter = re.compile('[%=:;,"\'\(\)\[\]\{\}\<\>\.]+')
    words = [word
             for line in document
             for inter in line.casefold().split()
             for dirty in splitter.split(inter)
             for word in [dirty.strip('.-')]
             if len(word)>1 and not word.isdigit()]
    return words

for year in ayears:
    ayears[year]['abstracts'] = [ linestowords(abstract) for abstract in ayears[year]['abstracts'] ]
    ayears[year] = ayears[year][ ayears[year]['abstracts'].apply(bool) ]
    ayears[year].index = range( len( ayears[year] ) )

# Identify and join "bigrams" so ['são', 'paulo'] becomes ['são_paulo']
bigram = gensim.models.phrases.Phrases( [ abstract
                                          for year in ayears
                                          for abstract in ayears[year]['abstracts'] ] )

for year in ayears:
    ayears[year]['abstracts'] = [ bigram[abstract] for abstract in ayears[year]['abstracts'] ]
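As a quick illustration of the two steps above, here is what the tokenizer and the phrase model do to a toy abstract; which bigrams actually get joined depends on the co-occurrence statistics learned from the whole corpus, so the second output is only indicative.

# Illustration only: tokenization and bigram joining on a toy abstract
toy = ['Oestrogen receptors (ER) were measured; 45 patients were studied.']
print(linestowords(toy))
# -> ['oestrogen', 'receptors', 'er', 'were', 'measured', 'patients', 'were', 'studied']
print(bigram[linestowords(toy)])
# -> the same list, with frequent pairs possibly joined, e.g. 'oestrogen_receptors'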
In [6]:
yearmodels=dict()

if opt_load_models:
    # Load existing models from saved files
    for year in ayears:
        model_path = '-'.join(['w2v', opt_model_name, str(year)])
        if os.path.exists( model_path ):
            yearmodels[year] = gensim.models.Word2Vec.load( model_path )

missing_years = set(ayears).difference( set(yearmodels) )

if missing_years:
    # Create a copy of the abstracts, as the original is needed for the analysis.
    # This must happen whenever there are years left to train, not only when the
    # base model is built, because train() below reads from model_ayears.
    model_ayears = deepcopy(ayears)

    # Shuffle abstracts, because order matters in training
    for year in model_ayears:
        index = list(model_ayears[year].index)
        numpy.random.shuffle(index)
        model_ayears[year].index = index
        model_ayears[year].sort_index(inplace=True)

    # Balance abstracts according to the chosen training mode
    if opt_training_mode == 'nothing':
        pass
    elif opt_training_mode == 'randomfill':
        # Repeat abstracts up to the size of the largest corpus to see whether this fixes the size bias
        maxsize = max( map( len, model_ayears.values() ) )
        for year in model_ayears:
            size = len(model_ayears[year])
            factor = int(maxsize/size)
            rest = maxsize - size*factor
            model_ayears[year] = pandas.concat( [ model_ayears[year] for i in range(factor) ]
                                                + [ model_ayears[year][:rest] ], ignore_index=True )
    elif opt_training_mode == 'randomsample':
        # Cut every year down to the same size to see whether this fixes the size bias
        minsize = 1000 # min( map(len, model_ayears.values() )) -> 96 and 98 were strong
        for year in model_ayears:
            model_ayears[year] = model_ayears[year][:minsize]
    else:
        raise Exception('opt_training_mode not found: ' + opt_training_mode)

    basemodel_path = '-'.join(['w2v', opt_model_name, 'base'])

    if os.path.exists( basemodel_path ):
        # Load the existing base model
        basemodel = gensim.models.Word2Vec.load( basemodel_path )
    else:
        # Create the base model; hs=1 and negative=0 are required by .score()
        basemodel = gensim.models.Word2Vec(workers=multiprocessing.cpu_count(),
                                           iter=opt_model_iter,
                                           hs=1, negative=0)

        # Build the common vocabulary from all years
        basemodel.build_vocab( pandas.concat( [model_ayears[year]['abstracts'] for year in model_ayears],
                                              ignore_index=True ) )

        # Save the base model
        basemodel.save( basemodel_path )

    print(basemodel)

    # Create a model for each year and train it on that year's abstracts
    for year in sorted(missing_years):
        yearmodels[year] = deepcopy(basemodel)
        yearmodels[year].train(model_ayears[year]['abstracts'], total_examples=len(model_ayears[year]))
        yearmodels[year].save( '-'.join(['w2v', opt_model_name, str(year)]) )
Word2Vec(vocab=72724, size=100, alpha=0.025)
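Each year now has its own trained copy of the base model. A quick sanity check is to ask one of them for the nearest neighbours of a term; 'tamoxifen' is just an example and has to be in the shared vocabulary, and the neighbours will vary between runs.

# Illustration only: nearest neighbours of a term in one year's model
yearmodels[2010].most_similar('tamoxifen', topn=5)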
In [7]:
def calc_probs(ayears, yearmodels):
    # score() takes a list of sentences here; it could also be a sentence generator
    sentlist = [ abstract for year in ayears for abstract in ayears[year]['abstracts'] ]
    # the log likelihood of each abstract under each year's w2v model
    llhd = numpy.array( [ yearmodels[year].score(sentlist, len(sentlist))
                          for year in ayears ] )
    # exponentiate to get (scaled) likelihoods,
    lhd = numpy.exp(llhd - llhd.max(axis=0)) # subtract each abstract's max across models to avoid numerical overflow
    # normalize across models (years) to get abstract-year probabilities
    prob = pandas.DataFrame( (lhd/lhd.sum(axis=0)).transpose(), columns=ayears )
    # finally, record the source year and the position of each abstract within its year
    prob['year'] = [ year for year in ayears for abstract in ayears[year]['abstracts'] ]
    prob['pos'] = pandas.concat( [ ayears[year].index.to_series() for year in ayears ], ignore_index=True)
    return prob

def calc_probs2(ayears, yearmodels):
    for year_a in ayears:
        # the log likelihood of each abstract of year_a under each year's w2v model
        llhd = pandas.DataFrame( ( yearmodels[year_m].score( ayears[year_a]['abstracts'], len(ayears[year_a]) )
                                   for year_m in ayears ),
                                 index=yearmodels)
        # exponentiate to get (scaled) likelihoods,
        lhd = numpy.exp(llhd - llhd.max(axis=0)) # subtract each abstract's max across models to avoid numerical overflow
        # normalize across models (years) to get abstract-year probabilities
        plhd = (lhd/lhd.sum(axis=0)).T
        # append the per-year probability columns to the year's DataFrame
        ayears[year_a] = pandas.concat( [ayears[year_a], plhd], axis=1 )
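The exponentiate-and-normalise step in both functions is a softmax over each abstract's log likelihoods across the year models; subtracting the per-abstract maximum first keeps the exponentials representable. A small worked example with arbitrary numbers:

# Illustration only: turning log likelihoods into per-year probabilities
_ll = numpy.array([-7000.0, -6990.0, -6995.0])  # log likelihood of one abstract under 3 models
_l = numpy.exp(_ll - _ll.max())                 # [e^-10, 1, e^-5]; naively exponentiating -7000 would underflow to 0
print(_l / _l.sum())                            # ~[4.5e-05, 0.9933, 0.0067]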
In [8]:
#probs = calc_probs(ayears, yearmodels)
calc_probs2(ayears, yearmodels)
In [9]:
ayears[1992]
Out[9]:
[Output: a 134 rows × 29 columns DataFrame of the 1992 abstracts, with the 'abstracts', 'pmids' and 'citations' columns plus one probability column per year model (1991–2016). The 1992 column is 1.0 for every abstract, while the probabilities under the other years' models are vanishingly small, many underflowing to 0.]

In [10]:
def gen_plots(ayears):
    probs = list(range(1991,2017))

    # Each subplot is the model trained from that year crossed with the corpus for each year
    for year in ayears:
        ayears[year]['year']=year
    allyears = pandas.concat( [ayears[year] for year in ayears] )
    axs = allyears.boxplot( probs, by='year', whis=[5,95], showmeans=1, layout=(9,3),
                          figsize=(17,17), return_type='axes')
    for ax in axs.values():
        ax.set_xticklabels([x.get_text()[-2:] for x in ax.get_xticklabels()])

    # Each subplot is the corpus for that year crossed with each of the models
    plt.figure(figsize=(17,17))
    for i, target in enumerate( ayears ):
        plt.subplot(9,3,i+1, label=str(target))
        ax = ayears[target].boxplot( probs, whis=[5,95], showmeans=1, return_type='axes')
        ax.set_title(str(target))
        ax.set_xticklabels([x.get_text()[-2:] for x in ax.get_xticklabels()])
    plt.tight_layout()

    # Histogram of abstracts per year
    plt.figure(figsize=(17,10))
    plt.bar(list(ayears), [len(ayears[year]) for year in ayears])
    plt.title('Number of abstracts per year')
In [11]:
gen_plots(ayears)
In [19]:
def calc_ratios(ayears):
    # Calculate ratios between an abstract's probabilities under adjacent years' models:
    # rnext = next/own, rprev = own/previous, rjump = next/previous
    nextyear = dict( (y, list(ayears)[i+1]) for i, y in enumerate(list(ayears)[:-1]) )
    prevyear = dict( (y, list(ayears)[i-1]) for i, y in enumerate(list(ayears)[1:]) )

    for year in list(ayears)[:-1]:
        ayears[year]['rnext'] = ayears[year].loc[ :, nextyear[year] ] / ayears[year].loc[ :, year ]
    for year in list(ayears)[1:]:
        ayears[year]['rprev'] = ayears[year].loc[ :, year ] / ayears[year].loc[ :, prevyear[year] ]
    for year in list(ayears)[1:-1]:
        ayears[year]['rjump'] = ayears[year].loc[ :, nextyear[year] ] / ayears[year].loc[ :, prevyear[year] ]

def find_ratios(ayears, year, ratio='rjump'):
    # Find the 10 largest ratios for a given year
    return ayears[year][ratio].sort_values(ascending=False)[:10]

def print_ratios(ayears):
    for year in list(ayears)[1:-1]:
        print(year, '\n', find_ratios(ayears, year), '\n')
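Note that many of these ratios come out as inf in the output below, because the denominator probability has underflowed to exactly zero. A possible alternative, sketched here under the assumption that one keeps the raw log likelihoods from score() instead of the normalised probabilities, is to compare adjacent years in log space, where no division is needed:

# Sketch only (assumes llhd_next and llhd_own are arrays of raw log likelihoods
# per abstract, kept from score() before exponentiation): the log-ratio
# log P(next) - log P(own) never produces inf from an underflowed denominator.
def log_ratio(llhd_next, llhd_own):
    return numpy.asarray(llhd_next) - numpy.asarray(llhd_own)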
In [20]:
calc_ratios(ayears)
print_ratios(ayears)
1992 
 106    1.507810e+05
54     2.659418e-15
40     1.607405e-22
96     1.239216e-28
13     8.859577e-31
94     8.359975e-50
42     2.337122e-50
25     6.060018e-51
107    2.240071e-52
78     6.379852e-54
Name: rjump, dtype: float64 

1993 
 2               inf
146             inf
34              inf
136             inf
41              inf
61              inf
11     1.852934e+80
113    8.174425e+47
166    1.145025e+40
86     1.417681e+39
Name: rjump, dtype: float64 

1994 
 50              inf
154             inf
168             inf
150             inf
67              inf
59              inf
61              inf
49     1.430782e+72
169    2.201745e+54
151    2.119695e+49
Name: rjump, dtype: float64 

1995 
 91     inf
197    inf
141    inf
142    inf
101    inf
149    inf
152    inf
154    inf
105    inf
80     inf
Name: rjump, dtype: float64 

1996 
 149    inf
145    inf
148    inf
53     inf
150    inf
28     inf
123    inf
164    inf
117    inf
18     inf
Name: rjump, dtype: float64 

1997 
 91     inf
126    inf
117    inf
115    inf
114    inf
106    inf
169    inf
172    inf
176    inf
101    inf
Name: rjump, dtype: float64 

1998 
 104    inf
89     inf
134    inf
194    inf
256    inf
102    inf
203    inf
211    inf
146    inf
144    inf
Name: rjump, dtype: float64 

1999 
 185             inf
171             inf
10              inf
47     5.603510e+54
163    5.893483e+33
231    1.305438e+29
77     9.581450e+27
263    9.580344e+25
99     3.759560e+21
134    8.226404e+20
Name: rjump, dtype: float64 

2000 
 149             inf
242             inf
131             inf
166             inf
80              inf
220    1.780273e+60
84     1.065265e+58
180    1.068795e+45
181    2.429191e+44
79     8.237750e+39
Name: rjump, dtype: float64 

2001 
 198             inf
280             inf
189             inf
231             inf
99     9.605496e+33
5      8.772599e+27
247    5.423513e+25
13     3.506435e+21
58     3.589830e+19
297    2.473819e+18
Name: rjump, dtype: float64 

2002 
 263    inf
205    inf
42     inf
76     inf
83     inf
261    inf
110    inf
123    inf
125    inf
133    inf
Name: rjump, dtype: float64 

2003 
 332    inf
65     inf
253    inf
252    inf
240    inf
229    inf
223    inf
51     inf
54     inf
64     inf
Name: rjump, dtype: float64 

2004 
 306    inf
287    inf
263    inf
192    inf
266    inf
274    inf
275    inf
191    inf
183    inf
279    inf
Name: rjump, dtype: float64 

2005 
 228    inf
243    inf
274    inf
273    inf
261    inf
260    inf
36     inf
251    inf
247    inf
240    inf
Name: rjump, dtype: float64 

2006 
 437    inf
54     inf
447    inf
448    inf
99     inf
382    inf
56     inf
164    inf
263    inf
299    inf
Name: rjump, dtype: float64 

2007 
 182    inf
653    inf
224    inf
539    inf
540    inf
548    inf
201    inf
728    inf
577    inf
610    inf
Name: rjump, dtype: float64 

2008 
 1417              inf
1450    2.817030e+136
1299    1.511738e+132
299     7.450019e+127
531     5.992834e+118
1358    2.051136e+117
238     2.181882e+109
1186    2.297660e+107
1085    5.024407e+105
833     2.984139e+105
Name: rjump, dtype: float64 

2009 
 1765    2.446474e+94
1593    1.712621e+83
1886    3.432620e+78
579     2.057231e+78
1557    2.016592e+76
1652    9.952296e+70
768     6.211284e+70
538     1.473150e+70
1779    9.354770e+69
1054    3.845287e+69
Name: rjump, dtype: float64 

2010 
 2133    6.448458e+43
339     2.409024e+43
887     7.310231e+41
1465    2.859195e+36
2209    2.261257e+36
321     1.929440e+36
1613    4.342107e+35
1531    9.848332e+34
1128    4.126498e+34
204     6.751141e+33
Name: rjump, dtype: float64 

2011 
 2432    1.091061e+67
2639    5.517378e+41
1408    6.131499e+39
2505    4.862271e+39
1785    1.091828e+39
695     7.629634e+38
2088    2.626308e+36
1269    7.038477e+35
2535    4.328347e+35
2346    1.549738e+35
Name: rjump, dtype: float64 

2012 
 2188    7.987114e+34
1373    5.621542e+33
2686    2.027995e+32
1546    5.709602e+30
1966    9.906068e+29
2508    3.980488e+29
964     3.665655e+29
442     1.823290e+29
1269    4.961530e+28
1960    3.018518e+28
Name: rjump, dtype: float64 

2013 
 1238    5.935524e+43
1902    2.355868e+35
2377    6.610871e+33
977     1.205383e+28
3331    3.231201e+27
1347    1.614819e+26
2984    1.023564e+26
1728    2.834365e+25
2967    1.519629e+25
593     9.065813e+24
Name: rjump, dtype: float64 

2014 
 956     6.362369e+56
3082    2.304389e+44
1528    4.433579e+43
971     8.776174e+33
2864    5.756258e+33
555     7.056829e+32
3598    3.568813e+32
599     2.374377e+31
1588    9.269862e+30
1069    3.145453e+30
Name: rjump, dtype: float64 

2015 
 2525    2.346465e+54
2316    2.855980e+44
3149    1.159159e+40
599     8.174641e+39
3761    3.537345e+34
3056    5.113337e+32
1835    1.283263e+32
1148    1.142054e+32
942     7.751670e+29
1856    1.598281e+29
Name: rjump, dtype: float64 

In [21]:
def print_corr(ayears, ratio='rjump'):
    if False:
        for year in list(ayears)[1:-1]:
            print('\n\n', year, '\n')
            cyears = ayears[year][[ratio,'citations']]
            print('\n\npearson\n', cyears.corr())
            print('\n\nkendall\n', cyears.corr('kendall'))
            print('\n\nspearman\n', cyears.corr('spearman'))
        
    plt.figure(figsize=(17,17))
    for i, target in enumerate( list(ayears)[1:-1] ):
#        plt.subplot(8, 3,i+1, label=str(target))
        cyears = ayears[target][[ratio,'citations']]
        ax = cyears.plot( 'citations', ratio, kind='scatter', logy=True)
        ax.set_title(str(target))
#        ax.set_xticklabels([x.get_text()[-2:] for x in ax.get_xticklabels()])
#    plt.tight_layout()

print_corr(ayears)
/srv/lisis-lab/devroot/home/ale/.local/lib/python3.5/site-packages/matplotlib/pyplot.py:516: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  max_open_warning, RuntimeWarning)
<matplotlib.figure.Figure at 0x7f9c74de62e8>
In [22]:
print_corr(ayears, 'rnext')
/srv/lisis-lab/devroot/home/ale/.local/lib/python3.5/site-packages/matplotlib/pyplot.py:516: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  max_open_warning, RuntimeWarning)
<matplotlib.figure.Figure at 0x7f9c75efd470>
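The warnings above come from print_corr() opening one new figure per year. A possible variant, sketched here with the same DataFrame columns, draws all the scatter plots onto a single grid of axes instead:

# Sketch only: one figure with a grid of axes instead of one figure per year
def print_corr_grid(ayears, ratio='rjump'):
    years = list(ayears)[1:-1]
    fig, axs = plt.subplots(8, 3, figsize=(17, 17))
    for ax, target in zip(axs.flat, years):
        ayears[target].plot('citations', ratio, kind='scatter', logy=True, ax=ax)
        ax.set_title(str(target))
    plt.tight_layout()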
In [15]:
# Some inspection functions

def readindex(year, index):
    # Print the abstract at the given index for the given year
    print( ' '.join(ayears[year]['abstracts'][index]) )

def wordinfo(word):
    # Print count and similar words for a given word
    for year in ayears:
        m=yearmodels[year]
        print(year)
        print('Count: ', m.vocab[word].count)
        print(m.most_similar(word, topn=3))

# Check if some abstracts are too small or too big
def list_abstracts_with_length(ayears, low=100, high=sys.maxsize):
    for year in ayears:
        selected=[]
        for i, item in enumerate(ayears[year]['abstracts']):
            if len(item) < low or len(item)>high:
                selected.append((i,item))
        print('len', year, [ len(x[1]) for x in selected])
        print('idx', year, [ x[0] for x in selected])
        print()

def wordstuff():
    # Creates a list of words sorted by their total count
    words = [ w for w, c in sorted(basemodel.vocab.items(), key=lambda x: x[1], reverse=True) ]

    # Log plot of the word count in the order set above
    plt.bar( [i for i in range(len(words))], numpy.log([basemodel.vocab[w].count for w in words]) )

    # Prints some words with extreme counts:
    for w in words[:10]+words[-10:]:
        print(w, basemodel.vocab[w].count)
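Typical usage of these helpers, for illustration only (the index and the word are arbitrary examples and must exist in the data and the vocabulary respectively):

# Examples only:
# readindex(1992, 106)                      # print the abstract at index 106 of 1992
# wordinfo('tamoxifen')                     # per-year count and nearest neighbours of a word
# list_abstracts_with_length(ayears, 50)    # flag abstracts shorter than 50 words
# wordstuff()                               # word-count distribution of the whole vocabulary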
In [ ]: