Study of the temporal evolution of a corpus through word embedding

Author(s):
License:

GNU-GPLv3+

Project:

https://en.wikiversity.org/wiki/The_dynamics_and_social_organization_of_innovation_in_the_field_of_oncology

Reference repository for this file:

https://gitlab.com/solstag/oncologister-pvt

Contributions are welcome; get in touch with the author(s).

In [1]:
# This notebook requires pandas, Gensim and lxml
import re, multiprocessing, numpy, json, os, sys, math
import pandas, gensim
from collections import defaultdict, namedtuple, OrderedDict
from copy import deepcopy
import matplotlib.pyplot as plt
from lxml import html
%matplotlib inline
In [2]:
# Options used to run the notebook
opt_your_initials = 'aha' # SET THIS BEFORE RUNNING, e.g. 'aha', 'jpc', ...
opt_data_source = 'getpapersdir'
opt_training_mode = 'randomfill'
opt_model_iter = 5
opt_name_extra = ''
opt_model_name = '-'.join( [opt_your_initials, opt_data_source, opt_training_mode, str(opt_model_iter)] )
opt_model_name += '-' + opt_name_extra if opt_name_extra else ''
opt_load_models = True
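With the defaults above the extra suffix is skipped, so opt_model_name resolves to 'aha-getpapersdir-randomfill-5'; this string is embedded in the file name of every model saved below ('w2v-<opt_model_name>-<year>').

In [ ]:
# With the default options above this prints 'aha-getpapersdir-randomfill-5'
print(opt_model_name)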
In [3]:
# Source abstracts from JSON files generated by esmo.py
def get_esmo(ayears, esmo_path):
    # get converted PDF files
    for year in os.listdir(esmo_path):
        year_path = os.path.join(esmo_path,year,'xml')
        for sec in os.listdir(year_path):
            if sec.endswith('.json'):
                sec_path = os.path.join(year_path,sec)
                with open(sec_path) as sec_file:
                    content = json.load(sec_file)
                ayears[int(year)].abstracts.extend( [ l['text'] for l in item['abstract'] ]
                                                    for item in content if item['separator'] is not None )
    # get converted HTML files
    for year in os.listdir(esmo_path):
        year_path = os.path.join(esmo_path,year,'content')
        for sec in os.listdir(year_path):
            if sec.endswith('.json'):
                sec_path = os.path.join(year_path,sec)
                with open(sec_path) as sec_file:
                    content = json.load(sec_file)
                for a in content['abstracts']:
                    htext = html.fromstring(a['text'])
                    # drop the <h2> heading so only the abstract body remains
                    h2 = htext.find('h2')
                    h2.getparent().remove(h2)
                    abstract = [ a['title'],
                                 ' '.join(a['authors'].keys()),
                                 ' '.join(sum(a['authors'].values(),[])),
                                 htext.text_content() ]
                    ayears[int(year)].abstracts.append( abstract )

# Source abstracts from JSON files generated by ContentMine's getpapers.js
def get_getpapers(ayears, getpapers_path):
    names = [n for n in os.listdir(getpapers_path) if os.path.isdir(os.path.join(getpapers_path, n))]
    noabstract = []
    for name in names:
        with open(os.path.join(getpapers_path, name, 'eupmc_result.json')) as f:
            content = json.load(f)
        try:
            year = int(content["journalInfo"][0]["yearOfPublication"][0])
            abstract = content["abstractText"][0]
            pmid = content["id"][0]
            citation = int(content["citedByCount"][0])            
            if any( map( lambda x: len(x)!=1,
                         (content["abstractText"], content["id"], content["citedByCount"],
                          content["journalInfo"][0]["yearOfPublication"]) ) ):
                raise Exception('Non-unique item!')
            if not len(abstract):
                raise Exception('Empty abstract!')
            ayears[year].abstracts.append([abstract])
            ayears[year].pmids.append(pmid)
            ayears[year].citations.append(citation)
        except KeyError:
            noabstract.append(name)
    if noabstract:
        print('In <'+getpapers_path+'> missing "abstractText":', ', '.join(noabstract))
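For reference, get_getpapers relies on only a handful of fields from each eupmc_result.json and expects each of them to be a single-element list. A minimal record it would accept might look like the sketch below (the field names are those read by the code above; the values are invented for illustration).

In [ ]:
# Hypothetical minimal eupmc_result.json content accepted by get_getpapers
# (values are illustrative, not real data)
example_record = {
    "id": ["12345678"],                                # article identifier
    "abstractText": ["We studied ..."],                # the abstract itself
    "citedByCount": ["42"],                            # citation count, parsed with int()
    "journalInfo": [{"yearOfPublication": ["2005"]}],  # publication year, parsed with int()
}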
In [4]:
# Create the main data structure
ArticleSeries = namedtuple('Articles', ['abstracts', 'pmids', 'citations'])
ayears = defaultdict(lambda: ArticleSeries([], [], []))

# ESMO abstracts from esmo.py
if opt_data_source=='esmo':
    esmo_path='../data/esmo/auto_esmo'
    get_esmo(ayears, esmo_path)

# Getpapers single dir
if opt_data_source=='getpaperssingle':
    getpapers_path = '/home/aedesaegypti/contentmine/papers/diarrhea_angola_orig/'
    get_getpapers(ayears, getpapers_path)

# Getpapers dir store
if opt_data_source=='getpapersdir':

    getpapers_store = '../data/gotpapers/'
    getpapers_re    = re.compile('gp-mesh_Breast_Neoplasms-research_article-en-')
    getpapers_paths = [os.path.join(getpapers_store, name)
                         for name in os.listdir(getpapers_store)
                           if os.path.isdir(os.path.join(getpapers_store, name)) \
                             and getpapers_re.match(name)]
    # Tell us where we'll look for papers
    print('## Abstract paths:')
    for path in getpapers_paths:
        print(path)
    print()

    print('## Defective abstracts:')
    for getpapers_path in getpapers_paths:
        get_getpapers(ayears, getpapers_path)
    print()

# Transform into OrderedDict of DataFrames
ayears = OrderedDict( sorted( ayears.items() ) )
for year in ayears:
    ayears[year] = pandas.DataFrame( ayears[year]._asdict() )

# Tell us what we've got
print('## Abstract count for {}:'.format(opt_data_source))
for year in ayears:
    print(year, len(ayears[year]))
print('Total:', sum( map( len, ayears.values() ) ), '\n')
## Abstract paths:
../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-1991
../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-1992
../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-1993
../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-1994
../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-1995
../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-1996
../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-1997
../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-1998
../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-1999
../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2000
../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2001
../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2002
../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2003
../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2004
../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2005
../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2006
../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2007
../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2008
../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2009
../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2010
../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2011
../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2012
../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2013
../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2014
../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2015
../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2016

## Defective abstracts:
In <../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-1997> missing "abstractText": PMC3950102, PMC3950105
In <../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-1998> missing "abstractText": PMC3950056
In <../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2001> missing "abstractText": PMC4809519
In <../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2002> missing "abstractText": PMC4052987
In <../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2003> missing "abstractText": PMC3157308, PMC140764, PMC2933200
In <../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2004> missing "abstractText": PMC2750770
In <../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2007> missing "abstractText": PMC2409269, PMC1963418
In <../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2008> missing "abstractText": PMC2605101, PMC2921443, PMC2577595, PMC2662631, PMC2910511, PMC2613431, PMC2637546, PMC2692037, PMC2597463, PMC2613188, PMC2742969, PMC2397515, PMC2613432, PMC2613189, PMC2613186, PMC2701509, PMC2646902
In <../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2009> missing "abstractText": PMC2797682, PMC2827040, PMC3462220, PMC2998989, PMC4059389, PMC3050000, PMC4276027, PMC4828958, PMC4119170, PMC3085997, PMC2884369, PMC2721826, PMC3086398, PMC2792756, PMC2710299, PMC2927217, PMC2701308, PMC2940713, PMC2702716, PMC2762202, PMC2805406, PMC3432638
In <../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2010> missing "abstractText": PMC3046437, PMC2854631, PMC3086479, PMC3727643, PMC2972547, PMC3532890, PMC4633045, PMC3086476, PMC3005737, PMC2831800, PMC3011974, PMC3891888, PMC2802109, PMC3069698, PMC2980856, PMC3910096
In <../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2011> missing "abstractText": PMC4291023, PMC3070703, PMC3268011, PMC3154075, PMC3109560, PMC3173545, PMC3288213, PMC4438279, PMC3109583, PMC3533368, PMC2944904, PMC3109561, PMC3139707, PMC3181002, PMC3085082, PMC2993828, PMC3063418, PMC3285267, PMC3549592
In <../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2012> missing "abstractText": PMC4105700, PMC4086916, PMC3459983, PMC3926433, PMC3618958, PMC4005330, PMC3384020, PMC4529025, PMC4816219, PMC3760188, PMC3285417, PMC4084672
In <../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2013> missing "abstractText": PMC3919429, PMC4116686, PMC4010071, PMC4495903, PMC3960406, PMC3965749, PMC3728631, PMC3745823, PMC4084801, PMC4174342, PMC3782420, PMC3631073, PMC3628101, PMC3639538, PMC3978629, PMC3639537, PMC3639541, PMC3825305, PMC3755937, PMC3962824, PMC3845670, PMC3662856
In <../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2014> missing "abstractText": PMC4423750, PMC4221801, PMC4422409, PMC4242865, PMC4176456, PMC4159115, PMC4282611, PMC4303193, PMC4303197, PMC4132248, PMC4132219, PMC4467453, PMC4076619, PMC4272910, PMC4828656, PMC4704099, PMC4169670, PMC3926795, PMC4137837, PMC4137836, PMC4147833, PMC3976704, PMC4303350, PMC4254381, PMC4167984, PMC3908999, PMC4560354, PMC4271782, PMC3887419, PMC4011400, PMC4081638, PMC4053154, PMC4328123, PMC4235676, PMC4052563, PMC3926784, PMC4243053, PMC4677672, PMC4053237, PMC3926792
In <../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2015> missing "abstractText": PMC4691585, PMC4691584, PMC4691586, PMC4842308, PMC4510219, PMC4569197, PMC4569215, PMC4564307, PMC4477195, PMC4511377, PMC4822528, PMC4412310, PMC4589306, PMC4708077, PMC4503333, PMC4521403, PMC4309631, PMC4610139, PMC4507296, PMC4347900, PMC4603827
In <../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2016> missing "abstractText": PMC4920648

## Abstract count for getpapersdir:
1991 153
1992 152
1993 191
1994 189
1995 219
1996 262
1997 289
1998 305
1999 304
2000 282
2001 339
2002 281
2003 347
2004 421
2005 480
2006 552
2007 812
2008 1499
2009 1951
2010 2349
2011 2704
2012 3321
2013 3785
2014 4324
2015 3976
2016 1090
Total: 30577 

In [5]:
# Split words and do some cleaning
def linestowords(document):
    splitter = re.compile('[%=:;,"\'\(\)\[\]\{\}\<\>\.]+')
    words = [word
             for line in document
             for inter in line.casefold().split()
             for dirty in splitter.split(inter)
             for word in [dirty.strip('.-')]
             if len(word)>1 and not word.isdigit()]
    return words

for year in ayears:
    ayears[year]['abstracts'] = [ linestowords(abstract) for abstract in ayears[year]['abstracts'] ]
    ayears[year] = ayears[year][ ayears[year]['abstracts'].apply(bool) ]
    ayears[year].index = range( len( ayears[year] ) )

# Identify and join "bigrams" so ['são', 'paulo'] becomes ['são_paulo']
bigram = gensim.models.phrases.Phrases( [ abstract
                                          for year in ayears
                                          for abstract in ayears[year]['abstracts'] ] )

for year in ayears:
    ayears[year]['abstracts'] = [ bigram[abstract] for abstract in ayears[year]['abstracts'] ]
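As a quick illustration of what the tokenizer and the phrase detector do, the cell below runs them on a made-up abstract line; whether two tokens actually get joined into a bigram depends on their co-occurrence counts in the corpus, so the second output is only indicative.

In [ ]:
# Illustrative only: tokenize a made-up abstract line and apply the bigram model
sample = ['Triple-negative Breast Cancer (TNBC): a 2-year follow-up.']
print(linestowords(sample))
# ['triple-negative', 'breast', 'cancer', 'tnbc', '2-year', 'follow-up']
print(bigram[linestowords(sample)])
# e.g. ['triple-negative', 'breast_cancer', 'tnbc', '2-year', 'follow-up'],
# if 'breast cancer' co-occurs often enough in the corpus to be joined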
In [6]:
# Modeling time!
yearmodels=dict()

if opt_load_models:
    # Load existing models from saved files
    for year in ayears:
        model_path = '-'.join(['w2v', opt_model_name, str(year)])
        if os.path.exists( model_path ):
            yearmodels[year] = gensim.models.Word2Vec.load( model_path )

missing_years = set(ayears).difference( set(yearmodels) )

if missing_years:
    basemodel_path = '-'.join(['w2v', opt_model_name, 'base'])

    # Work on a copy of the abstracts: the original is needed for analysis, while the
    # copy is shuffled/balanced for training and for building the vocabulary below
    model_ayears = deepcopy(ayears)

    # Shuffle abstracts, because order matters in training
    for year in model_ayears:
        index = list(model_ayears[year].index)
        numpy.random.shuffle(index)
        model_ayears[year].index = index
        model_ayears[year].sort_index(inplace=True)

    # Balance abstracts if we chose to
    if opt_training_mode == 'nothing':
        pass
    elif opt_training_mode == 'randomfill':
        # Fill them up to match the largest corpus to see if this fixes the size bias
        maxsize = max( map( len, model_ayears.values() ) )
        for year in model_ayears:
            size = len(model_ayears[year])
            factor = int(maxsize/size)
            rest = maxsize - size*factor
            model_ayears[year] = pandas.concat( [ model_ayears[year] for i in range(factor) ]
                                                + [ model_ayears[year][:rest] ], ignore_index=True )
    elif opt_training_mode == 'randomsample':
        # Cut to same size to see if this fixes the size bias
        minsize = 1000 # min( map(len, model_ayears.values() )) -> 96 and 98 were strong
        for year in model_ayears:
            model_ayears[year] = model_ayears[year][:minsize]
    else:
        raise Exception('opt_training_mode not found: ' + opt_training_mode)

    if os.path.exists( basemodel_path ):
        # Load the existing base model
        basemodel = gensim.models.Word2Vec.load( basemodel_path )

    else:
        # Create the base model; hs=1 and negative=0 are required by .score()
        basemodel = gensim.models.Word2Vec(workers=multiprocessing.cpu_count(),
                                           iter=opt_model_iter,
                                           hs=1, negative=0)

        # Build common vocabulary from all years
        basemodel.build_vocab( pandas.concat( [model_ayears[year]['abstracts'] for year in model_ayears],
                                              ignore_index=True ) )

        # Save the base model
        basemodel.save( basemodel_path )

    print(basemodel)

    # Create a model for each missing year and train it from the shared base model
    for year in sorted(missing_years):
        yearmodels[year] = deepcopy(basemodel)
        yearmodels[year].train(model_ayears[year]['abstracts'], total_examples=len(model_ayears[year]))
        yearmodels[year].save( '-'.join(['w2v', opt_model_name, str(year)]) )
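Each per-year model is saved as an ordinary gensim Word2Vec file named 'w2v-<opt_model_name>-<year>', so it can be reloaded and inspected on its own. A quick sanity check might look like the cell below; the year and the query word are assumptions chosen for illustration and should be replaced by any year in the corpus and any term in the shared vocabulary.

In [ ]:
# Illustrative sanity check: reload one year's model and inspect nearest neighbours
# (the year and the word 'tamoxifen' are assumed examples)
m = gensim.models.Word2Vec.load('-'.join(['w2v', opt_model_name, '2005']))
print(m.most_similar('tamoxifen', topn=5))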
In [7]:
def calc_probs(ayears, yearmodels):
    for year_a in ayears:
        # the log-likelihood of each abstract from this year under each year's w2v model
        llhd = pandas.DataFrame( ( numpy.float128(yearmodels[year_m].score( ayears[year_a]['abstracts'],
                                                                            len(ayears[year_a]) ) )
                                   for year_m in ayears ),
                                 index=list(ayears) )
        # now exponentiate to get likelihoods
        lhd = numpy.exp(llhd - llhd[llhd.index!=year_a].max(axis=0)) # subtract row max to avoid numerical under/overflow
        # normalize across models (years) to get sentence-year probabilities
        plhd = (lhd/lhd[lhd.index!=year_a].sum(axis=0))
        # add probabilities to our yearly dataframe
        llhd.index = ['l'+str(c) for c in llhd.index]
        plhd.index = ['p'+str(c) for c in plhd.index]
        ayears[year_a] = pandas.concat( [ayears[year_a], llhd.T, plhd.T], axis=1)
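The normalisation in calc_probs is essentially a softmax over the year models: exponentiating log-likelihoods directly would underflow, so the row maximum (over the other years' models) is subtracted first, which cancels out once the values are normalised. A minimal numeric sketch of the same idea, with toy values:

In [ ]:
# Minimal numeric sketch of the normalisation used in calc_probs (toy values):
# log-likelihoods of one abstract under three year models
toy_llhd = numpy.array([-1050.0, -1040.0, -1045.0])
toy_lhd = numpy.exp(toy_llhd - toy_llhd.max())  # subtract the max before exponentiating
toy_p = toy_lhd / toy_lhd.sum()                 # normalise across models
print(toy_p)  # roughly [4.5e-05, 0.993, 0.0067]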
In [8]:
pi = lambda x: ['p'+str(y) for y in x] if isinstance(x, list) else 'p'+str(x)
li = lambda x: ['l'+str(y) for y in x] if isinstance(x, list) else 'l'+str(x)
In [9]:
calc_probs(ayears, yearmodels)
In [10]:
def gen_plots(ayears):
    probs = [pi(year) for year in ayears]

    for year in ayears:
        ayears[year]['year']=year
    allyears = pandas.concat( [ayears[year] for year in ayears] )
    for year in ayears:
        allyears.loc[allyears.year==year, pi(year)] = numpy.NaN

    # Each subplot is the model trained from that year crossed with the corpus for each year
    axs = allyears.boxplot( probs, by='year', whis=[5,95], showmeans=1, layout=(9,3),
                          figsize=(17,17), return_type='axes')
    for ax in axs.values():
        ax.set_xticklabels([x.get_text()[-2:] for x in ax.get_xticklabels()])
    plt.suptitle('Models by data')

    # Each subplot is the corpus for that year crossed with each of the models
    plt.figure(figsize=(17,17))
    for i, target in enumerate( ayears ):
        plt.subplot(9, 3, i+1)
        ax = allyears.loc[allyears.year==target].boxplot(probs, whis=[5,95], showmeans=1, return_type='axes')
        ax.set_title(str(target))
        ax.set_xticklabels([x.get_text()[-2:] for x in ax.get_xticklabels()])
    plt.suptitle('Data by models')
    plt.tight_layout()

    # Bar chart of the number of abstracts per year
    plt.figure(figsize=(17,10))
    plt.bar(list(ayears), [len(ayears[year]) for year in ayears])
    plt.title('Number of abstracts per year')
In [11]:
gen_plots(ayears)
In [178]:
def calc_lratios(ayears):
    # Calculate log-ratios between an abstract's likelihood under its own year's model and under neighbouring years' models
    nextyear = dict( (y, list(ayears)[i+1]) for i, y in enumerate(list(ayears)[:-1]) )
    prevyear = dict( (y, list(ayears)[i-1]) for i, y in enumerate(list(ayears)[1:]) )
    nextyears = dict( (y, list(ayears)[i+1:]) for i, y in enumerate(list(ayears)[:-1]) )
    prevyears = dict( (y, list(ayears)[i-1:]) for i, y in enumerate(list(ayears)[1:]) )

    xi=li
    def lrsavg(ayears, year, target):
        return ayears[year].loc[ :, xi(target) ].sub(
            ayears[year].loc[ :, xi(target) ].mean(axis=0), axis=1 if isinstance(target, list) else 0 )
    
    for year in list(ayears):
        ayears[year]['lrsavg'] = lrsavg(ayears, year, year)
    for year in list(ayears)[:-1]:
        ayears[year]['lrnext'] = ayears[year].loc[ :, xi(nextyear[year]) ] - ayears[year].loc[ :, xi(year) ]
        ayears[year]['lrsavgnext'] = lrsavg(ayears, year, nextyear[year]).sub(lrsavg(ayears, year, year),axis=0)
        ayears[year]['lrnexts'] = ayears[year].loc[ :, xi(nextyears[year]) ].sub(
                                    ayears[year].loc[ :, xi(year) ], axis=0 ).sum(axis=1)
        ayears[year]['lrsavgnexts'] = lrsavg(ayears, year, nextyears[year]).sub(
                                    lrsavg(ayears, year, year), axis=0 ).sum(axis=1)
        ayears[year]['lrsavgnextmax'] = lrsavg(ayears, year, nextyears[year]).sub(
                                    lrsavg(ayears, year, year), axis=0 ).max(axis=1)
    for year in list(ayears)[1:]:
        ayears[year]['lrprev'] = ayears[year].loc[ :, xi(year) ] - ayears[year].loc[ :, xi(prevyear[year]) ]
    for year in list(ayears)[1:-1]:
        ayears[year]['lrjump'] = ayears[year].loc[ :, xi(nextyear[year]) ] - \
                                 ayears[year].loc[ :, xi(prevyear[year]) ]

def find_lratios(ayears, year, ratio='lrjump'):
    # Find the 10 largest ratios for a given year
    return ayears[year][ratio].sort_values(ascending=False)[:10]

def print_lratios(ayears):
    for year in list(ayears)[1:-1]:
        print(year, '\n', find_lratios(ayears, year), '\n')
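All of the 'lr*' columns are differences of log-likelihoods, i.e. logarithms of likelihood ratios. For instance, 'lrjump' for an abstract from year y is its log-likelihood under the model of year y+1 minus that under the model of year y-1, so a value of 5 means the abstract is about exp(5) ≈ 148 times more likely under the following year's model than under the preceding one. A toy illustration:

In [ ]:
# Toy illustration of what 'lrjump' measures (values invented)
l_prev, l_next = -1020.0, -1015.0  # log-likelihoods under the previous / next year's model
lrjump = l_next - l_prev           # 5.0
print(numpy.exp(lrjump))           # ~148: the abstract fits the next year's model much better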
In [179]:
calc_lratios(ayears)
In [180]:
def print_lcorr(ayears, ratio='lrjump'):
    for year in list(ayears)[1:-1]:
        print('\n\n', year, '\n')
        cyears = ayears[year][[ratio,'citations']]
        print('\n\npearson\n', cyears.corr())
        print('\n\nkendall\n', cyears.corr('kendall'))
        print('\n\nspearman\n', cyears.corr('spearman'))

def plot_lcorr(ayears, ratio='lrjump'):
    years = list(ayears)[1:-1]
    plt.figure(figsize=(17, len(years)*3))
    for i, target in enumerate( years ):
        ayear = ayears[target]
        ax = plt.subplot(int(len(years)/2)+1, 2, i+1)
        ax.set_title(ratio+' '+str(target))
        ax.scatter(ayear['citations'], ayear[ratio])
        r=ayear[['citations', ratio]].sort_values('citations').rolling(window=50).agg({'citations':lambda x: x[-1], ratio:'mean'})
        ax.plot(r['citations'], r[ratio], color='red')
        ax.axhline()
    plt.tight_layout()

plot_lcorr(ayears, 'lrsavgnext')
plot_lcorr(ayears, 'lrsavgnexts')
plot_lcorr(ayears, 'lrsavgnextmax')
In [15]:
# Some inspection functions

def readindex(year, index):
    # Print the abstract with the given index from the given year
    print( ' '.join(ayears[year]['abstracts'][index]) )

def wordinfo(word):
    # Print count and similar words for a given word
    for year in ayears:
        m=yearmodels[year]
        print(year)
        print('Count: ', m.vocab[word].count)
        print(m.most_similar(word, topn=3))

# Check if some abstracts are too short or too long
def list_abstracts_with_length(ayears, low=100, high=sys.maxsize):
    for year in ayears:
        selected=[]
        for i, item in enumerate(ayears[year]['abstracts']):
            if len(item) < low or len(item)>high:
                selected.append((i,item))
        print('len', year, [ len(x[1]) for x in selected])
        print('idx', year, [ x[0] for x in selected])
        print()

def wordstuff():
    # Creates a list of words sorted by their total count
    words = [ w for w, c in sorted(basemodel.vocab.items(), key=lambda x: x[1], reverse=True) ]

    # Log plot of the word count in the order set above
    plt.bar( [i for i in range(len(words))], numpy.log([basemodel.vocab[w].count for w in words]) )

    # Prints some words with extreme counts:
    for w in words[:10]+words[-10:]:
        print(w, basemodel.vocab[w].count)
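These helpers are meant to be called interactively while exploring the results; a few example calls (the year and the length threshold are illustrative):

In [ ]:
# Example interactive calls for the inspection helpers above (arguments are illustrative)
readindex(2005, 0)                           # print the first abstract from 2005
list_abstracts_with_length(ayears, low=50)   # flag abstracts shorter than 50 tokens
wordstuff()                                  # word-count extremes (requires basemodel from the training cell)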
In [ ]: