Study of the temporal evolution of a corpus through word embedding

Author(s):

License:

GNU-GPLv3+

Project:

https://en.wikiversity.org/wiki/The_dynamics_and_social_organization_of_innovation_in_the_field_of_oncology

Reference repository for this file:

https://gitlab.com/solstag/oncologister-pvt

Contributions are welcome; get in touch with the author(s).

In [1]:
# This notebook requires NumPy, Pandas, Matplotlib, Gensim and lxml
import re, multiprocessing, numpy, json, os, sys, math
import pandas, gensim
from collections import defaultdict, namedtuple, OrderedDict
from copy import deepcopy
import matplotlib.pyplot as plt
from lxml import html
%matplotlib inline
In [2]:
# Options used to run the notebook
opt_your_initials = 'aha' # SET THIS BEFORE RUNNING, e.g. 'aha', 'jpc', ...
opt_data_source = 'getpapersdir'
opt_training_mode = 'randomfill'
opt_model_iter = 5
opt_name_extra = ''
opt_model_name = '-'.join( [opt_your_initials, opt_data_source, opt_training_mode, str(opt_model_iter)] )
opt_model_name += '-' + opt_name_extra if opt_name_extra else ''
opt_load_models = True
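For reference, with the options above (and opt_name_extra left empty) the derived names resolve as in this small sketch:

# opt_model_name            -> 'aha-getpapersdir-randomfill-5'
# per-year model files      -> e.g. 'w2v-aha-getpapersdir-randomfill-5-1991' (see the modeling cell below)
# base model file           -> 'w2v-aha-getpapersdir-randomfill-5-base'
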
In [3]:
# Source abstracts from JSON files generated by esmo.py
def get_esmo(ayears, esmo_path):
    # get converted PDF files
    for year in os.listdir(esmo_path):
        year_path = os.path.join(esmo_path,year,'xml')
        for sec in os.listdir(year_path):
            if sec.endswith('.json'):
                sec_path = os.path.join(year_path,sec)
                with open(sec_path) as sec_file:
                    content = json.load(sec_file)
                ayears[int(year)].abstracts.extend( [ l['text'] for l in item['abstract'] ]
                                                    for item in content if item['separator'] is not None )
    # get converted HTML files
    for year in os.listdir(esmo_path):
        year_path = os.path.join(esmo_path,year,'content')
        for sec in os.listdir(year_path):
            if sec.endswith('.json'):
                sec_path = os.path.join(year_path,sec)
                with open(sec_path) as sec_file:
                    content = json.load(sec_file)
                for a in content['abstracts']:
                    htext = html.fromstring(a['text'])
                    # drop the <h2> heading so only the abstract body remains
                    h2 = htext.find('h2')
                    h2.getparent().remove(h2)
                    abstract = [ a['title'],
                                 ' '.join(a['authors'].keys()),
                                 ' '.join(sum(a['authors'].values(),[])),
                                 htext.text_content() ]
                    ayears[int(year)].abstracts.append( abstract )

# Source abstracts from JSON files generated by ContentMine's getpapers.js
def get_getpapers(ayears, getpapers_path):
    names = [n for n in os.listdir(getpapers_path) if os.path.isdir(os.path.join(getpapers_path, n))]
    noabstract = []
    for name in names:
        with open(os.path.join(getpapers_path, name, 'eupmc_result.json')) as f:
            content = json.load(f)
        try:
            year = int(content["journalInfo"][0]["yearOfPublication"][0])
            abstract = content["abstractText"][0]
            pmid = content["id"][0]
            citation = int(content["citedByCount"][0])    
            title = content["title"][0]
            if any( map( lambda x: len(x)!=1,
                         (content["abstractText"], content["id"], content["citedByCount"],
                          content["journalInfo"][0]["yearOfPublication"]) ) ):
                raise Exception('Non-unique item!')
            if not len(abstract):
                raise Exception('Empty abstract!')
            ayears[year].abstracts.append([abstract])
            ayears[year].pmids.append(pmid)
            ayears[year].citations.append(citation)
            ayears[year].titles.append(title)
        except KeyError:
            noabstract.append(name)
    if noabstract:
        print('In <'+getpapers_path+'> missing "abstractText":', ', '.join(noabstract))
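For orientation, get_getpapers() reads one eupmc_result.json per paper directory. The sketch below shows only the fields the function accesses; the values are made up and the surrounding structure may contain many more keys:

# Hypothetical, minimal eupmc_result.json record (only the fields read above)
example_record = {
    "id": ["12345678"],                                # PubMed id
    "title": ["An example title"],
    "abstractText": ["An example abstract."],
    "citedByCount": ["3"],
    "journalInfo": [{"yearOfPublication": ["2005"]}],  # publication year is parsed from here
}
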
In [4]:
# Create the main data structure
ArticleSeries = namedtuple('Articles', ['abstracts', 'pmids', 'citations','titles'])
ayears = defaultdict(lambda: ArticleSeries([], [], [], []))

# ESMO abstracts from esmo.py
if opt_data_source=='esmo':
    esmo_path='../data/esmo/auto_esmo'
    get_esmo(ayears, esmo_path)

# Getpapers single dir
if opt_data_source=='getpaperssingle':
    getpapers_path = '/home/aedesaegypti/contentmine/papers/diarrhea_angola_orig/'
    get_getpapers(ayears, getpapers_path)

# Getpapers dir store
if opt_data_source=='getpapersdir':

    getpapers_store = '../data/gotpapers/'
    getpapers_re    = re.compile('gp-mesh_Breast_Neoplasms-research_article-en-')
    getpapers_paths = [os.path.join(getpapers_store, name)
                         for name in os.listdir(getpapers_store)
                           if os.path.isdir(os.path.join(getpapers_store, name)) \
                             and getpapers_re.match(name)]
    # Tell us where we'll look for papers
    print('## Abstract paths:')
    for path in getpapers_paths:
        print(path)
    print()

    print('## Defective abstracts:')
    for getpapers_path in getpapers_paths:
        get_getpapers(ayears, getpapers_path)
    print()

# Transform into OrderedDict of DataFrames
ayears = OrderedDict( sorted( ayears.items() ) )
for year in ayears:
    ayears[year] = pandas.DataFrame( ayears[year]._asdict() )

# Tell us what we've got
print('## Abstract count for {}:'.format(opt_data_source))
for year in ayears:
    print(year, len(ayears[year]))
print('Total:', sum( map( len, ayears.values() ) ), '\n')
## Abstract paths:
../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-1991
../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-1992
../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-1993
../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-1994
../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-1995
../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-1996
../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-1997
../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-1998
../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-1999
../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2000
../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2001
../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2002
../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2003
../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2004
../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2005
../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2006
../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2007
../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2008
../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2009
../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2010
../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2011
../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2012
../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2013
../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2014
../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2015
../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2016

## Defective abstracts:
In <../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-1997> missing "abstractText": PMC3950102, PMC3950105
In <../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-1998> missing "abstractText": PMC3950056
In <../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2001> missing "abstractText": PMC4809519
In <../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2002> missing "abstractText": PMC4052987
In <../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2003> missing "abstractText": PMC3157308, PMC140764, PMC2933200
In <../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2004> missing "abstractText": PMC2750770
In <../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2007> missing "abstractText": PMC2409269, PMC1963418
In <../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2008> missing "abstractText": PMC2605101, PMC2921443, PMC2577595, PMC2662631, PMC2910511, PMC2613431, PMC2637546, PMC2692037, PMC2597463, PMC2613188, PMC2742969, PMC2397515, PMC2613432, PMC2613189, PMC2613186, PMC2701509, PMC2646902
In <../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2009> missing "abstractText": PMC2797682, PMC2827040, PMC3462220, PMC2998989, PMC4059389, PMC3050000, PMC4276027, PMC4828958, PMC4119170, PMC3085997, PMC2884369, PMC2721826, PMC3086398, PMC2792756, PMC2710299, PMC2927217, PMC2701308, PMC2940713, PMC2702716, PMC2762202, PMC2805406, PMC3432638
In <../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2010> missing "abstractText": PMC3046437, PMC2854631, PMC3086479, PMC3727643, PMC2972547, PMC3532890, PMC4633045, PMC3086476, PMC3005737, PMC2831800, PMC3011974, PMC3891888, PMC2802109, PMC3069698, PMC2980856, PMC3910096
In <../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2011> missing "abstractText": PMC4291023, PMC3070703, PMC3268011, PMC3154075, PMC3109560, PMC3173545, PMC3288213, PMC4438279, PMC3109583, PMC3533368, PMC2944904, PMC3109561, PMC3139707, PMC3181002, PMC3085082, PMC2993828, PMC3063418, PMC3285267, PMC3549592
In <../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2012> missing "abstractText": PMC4105700, PMC4086916, PMC3459983, PMC3926433, PMC3618958, PMC4005330, PMC3384020, PMC4529025, PMC4816219, PMC3760188, PMC3285417, PMC4084672
In <../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2013> missing "abstractText": PMC3919429, PMC4116686, PMC4010071, PMC4495903, PMC3960406, PMC3965749, PMC3728631, PMC3745823, PMC4084801, PMC4174342, PMC3782420, PMC3631073, PMC3628101, PMC3639538, PMC3978629, PMC3639537, PMC3639541, PMC3825305, PMC3755937, PMC3962824, PMC3845670, PMC3662856
In <../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2014> missing "abstractText": PMC4423750, PMC4221801, PMC4422409, PMC4242865, PMC4176456, PMC4159115, PMC4282611, PMC4303193, PMC4303197, PMC4132248, PMC4132219, PMC4467453, PMC4076619, PMC4272910, PMC4828656, PMC4704099, PMC4169670, PMC3926795, PMC4137837, PMC4137836, PMC4147833, PMC3976704, PMC4303350, PMC4254381, PMC4167984, PMC3908999, PMC4560354, PMC4271782, PMC3887419, PMC4011400, PMC4081638, PMC4053154, PMC4328123, PMC4235676, PMC4052563, PMC3926784, PMC4243053, PMC4677672, PMC4053237, PMC3926792
In <../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2015> missing "abstractText": PMC4691585, PMC4691584, PMC4691586, PMC4842308, PMC4510219, PMC4569197, PMC4569215, PMC4564307, PMC4477195, PMC4511377, PMC4822528, PMC4412310, PMC4589306, PMC4708077, PMC4503333, PMC4521403, PMC4309631, PMC4610139, PMC4507296, PMC4347900, PMC4603827
In <../data/gotpapers/gp-mesh_Breast_Neoplasms-research_article-en-2016> missing "abstractText": PMC4920648

## Abstract count for getpapersdir:
1991 153
1992 152
1993 191
1994 189
1995 219
1996 262
1997 289
1998 305
1999 304
2000 282
2001 339
2002 281
2003 347
2004 421
2005 480
2006 552
2007 812
2008 1499
2009 1951
2010 2349
2011 2704
2012 3321
2013 3785
2014 4324
2015 3976
2016 1090
Total: 30577 

In [5]:
# Split words and do some cleaning
def linestowords(document):
    splitter = re.compile('[%=:;,"\'\(\)\[\]\{\}\<\>\.]+')
    words = [word
             for line in document
             for inter in line.casefold().split()
             for dirty in splitter.split(inter)
             for word in [dirty.strip('.-')]
             if len(word)>1 and not word.isdigit()]
    return words

for year in ayears:
    ayears[year]['originals'] = ayears[year]['abstracts']
    ayears[year]['abstracts'] = [ linestowords(abstract) for abstract in ayears[year]['abstracts'] ]
    ayears[year] = ayears[year][ ayears[year]['abstracts'].apply(bool) ]
    ayears[year].index = range( len( ayears[year] ) )

# Identify and join "bigrams" so ['são', 'paulo'] becomes ['são_paulo']
bigram = gensim.models.phrases.Phrases( [ abstract
                                          for year in ayears
                                          for abstract in ayears[year]['abstracts'] ] )

for year in ayears:
    ayears[year]['abstracts'] = [ bigram[abstract] for abstract in ayears[year]['abstracts'] ]
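A quick sanity check of the two transformations above. The tokenization is deterministic; which bigrams get joined depends on co-occurrence counts in the corpus, so the joined form shown here is only an illustration:

# linestowords() lowercases, strips punctuation, and drops digits and single characters:
#   linestowords(['Breast cancer (BRCA1) in 2016.'])  ->  ['breast', 'cancer', 'brca1', 'in']
# The Phrases transform then joins frequently co-occurring pairs with an underscore, e.g. (illustrative):
#   bigram[['breast', 'cancer', 'brca1']]  ->  ['breast_cancer', 'brca1']
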
In [6]:
# Modeling time!
yearmodels=dict()

if opt_load_models:
    # Load existing models from saved files
    for year in ayears:
        model_path = '-'.join(['w2v', opt_model_name, str(year)])
        if os.path.exists( model_path ):
            yearmodels[year] = gensim.models.Word2Vec.load( model_path )

missing_years = set(ayears).difference( set(yearmodels) )

if missing_years:
    # Prepare the training corpus: it is needed both to build the shared vocabulary
    # and to train the per-year models below, even when the base model already exists on disk

    # Create a copy of the abstracts as the original is needed for analysis
    model_ayears = deepcopy(ayears)

    # Shuffle abstracts, because order matters in training
    for year in model_ayears:
        index = list(model_ayears[year].index)
        numpy.random.shuffle(index)
        model_ayears[year].index = index
        model_ayears[year].sort_index(inplace=True)

    # Balance abstracts if we chose to
    if opt_training_mode == 'nothing':
        pass
    elif opt_training_mode == 'randomfill':
        # Fill them up to match the largest corpus to see if this fixes the size bias
        maxsize = max( map( len, model_ayears.values() ) )
        for year in model_ayears:
            size = len(model_ayears[year])
            factor = int(maxsize/size)
            rest = maxsize - size*factor
            model_ayears[year] = pandas.concat( [ model_ayears[year] for i in range(factor) ]
                                                + [ model_ayears[year][:rest] ], ignore_index=True )
    elif opt_training_mode == 'randomsample':
        # Cut to the same size to see if this fixes the size bias
        minsize = 1000 # min( map(len, model_ayears.values() )) -> 96 and 98 were strong
        for year in model_ayears:
            model_ayears[year] = model_ayears[year][:minsize]
    else:
        raise Exception('opt_training_mode not found: ' + opt_training_mode)

    basemodel_path = '-'.join(['w2v', opt_model_name, 'base'])

    if os.path.exists( basemodel_path ):
        # Load the existing base model
        basemodel = gensim.models.Word2Vec.load( basemodel_path )

    else:
        # Create the base model, hs=1 and negative=0 are required by .score()
        basemodel = gensim.models.Word2Vec(workers=multiprocessing.cpu_count(),
                                           iter=opt_model_iter,
                                           hs=1, negative=0
                                           )

        # Build common vocabulary from all years
        basemodel.build_vocab( pandas.concat( [model_ayears[year]['abstracts'] for year in model_ayears],
                                              ignore_index=True ) )

        # Save the base model
        basemodel.save( basemodel_path )

    print(basemodel)

    # Create a model for each year and train it on that year's (balanced) abstracts
    for year in sorted(missing_years):
        yearmodels[year] = deepcopy(basemodel)
        yearmodels[year].train(model_ayears[year]['abstracts'], total_examples=len(model_ayears[year]))
        yearmodels[year].save( '-'.join(['w2v', opt_model_name, str(year)]) )
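To make the 'randomfill' balancing concrete, here is the arithmetic using the raw counts printed above (153 abstracts in 1991 and 4324 in 2014; the counts actually used may be slightly smaller once empty abstracts are dropped):

# Rough 'randomfill' arithmetic for 1991, assuming maxsize = 4324:
#   factor = int(4324 / 153) = 28,  rest = 4324 - 153*28 = 40
#   -> the shuffled 1991 frame is concatenated 28 times plus its first 40 rows,
#      so every year contributes about the same number of training documents.
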
In [7]:
def calc_probs(ayears, yearmodels):
    for year_a in ayears:
        # log likelihood of each abstract from this year under each year's w2v model
        llhd = pandas.DataFrame( ( numpy.float128(yearmodels[year_m].score( ayears[year_a]['abstracts'],
                                                                            len(ayears[year_a]) ) )
                                   for year_m in ayears ),
                                 index=list(ayears) )
        # now exponentiate to get likelihoods,
        # subtracting each abstract's maximum over the other years to avoid numerical overflow
        lhd = numpy.exp(llhd - llhd[llhd.index!=year_a].max(axis=0))
        # normalize across models (years) to get abstract-year probabilities
        plhd = (lhd/lhd[lhd.index!=year_a].sum(axis=0))
        # add probabilities to our yearly dataframe
        llhd.index = ['l'+str(c) for c in llhd.index]
        plhd.index = ['p'+str(c) for c in plhd.index]
        ayears[year_a] = pandas.concat( [ayears[year_a], llhd.T, plhd.T], axis=1)
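The exponentiation above uses the usual exp-normalize trick: shifting every log-likelihood by a per-abstract constant before exponentiating leaves the normalized probabilities unchanged while keeping the exponentials representable. A minimal sketch with made-up numbers:

# One abstract scored by three models (made-up log-likelihoods)
ll = numpy.array([-1210.0, -1195.0, -1202.0])
naive  = numpy.exp(ll) / numpy.exp(ll).sum()                        # exp(-1210) underflows to 0 -> nan
stable = numpy.exp(ll - ll.max()) / numpy.exp(ll - ll.max()).sum()  # ~ [3.1e-07, 9.99e-01, 9.1e-04]
# calc_probs() does the same, except the shift and the normalization exclude the abstract's own year.
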
In [8]:
pi = lambda x: ['p'+str(y) for y in x] if isinstance(x, list) else 'p'+str(x)
li = lambda x: ['l'+str(y) for y in x] if isinstance(x, list) else 'l'+str(x)
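These are just column-name helpers, mapping a year (or list of years) to the probability / log-likelihood columns added by calc_probs():

# pi(2005)          -> 'p2005'
# pi([2005, 2006])  -> ['p2005', 'p2006']
# li(2005)          -> 'l2005'
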
In [9]:
calc_probs(ayears, yearmodels)
In [10]:
def gen_plots(ayears):
    probs = [pi(year) for year in ayears]

    for year in ayears:
        ayears[year]['year']=year
    allyears = pandas.concat( [ayears[year] for year in ayears] )
    for year in ayears:
        allyears.loc[allyears.year==year, pi(year)] = numpy.NaN

    # Each subplot is the model trained from that year crossed with the corpus for each year
    axs = allyears.boxplot( probs, by='year', whis=[5,95], showmeans=1, layout=(9,3),
                          figsize=(17,17), return_type='axes')
    for ax in axs.values():
        ax.set_xticklabels([x.get_text()[-2:] for x in ax.get_xticklabels()])
    plt.title('Models by data')

    # Each subplot is the corpus for that year crossed with each of the models
    plt.figure(figsize=(17,17))
    for i, target in enumerate( ayears ):
        plt.subplot(9, 3, i+1)
        ax = allyears.loc[allyears.year==target].boxplot(probs, whis=[5,95], showmeans=1, return_type='axes')
        ax.set_title(str(target))
        ax.set_xticklabels([x.get_text()[-2:] for x in ax.get_xticklabels()])
    plt.title('Data by models')
    plt.tight_layout()

    # Histogram of abstracts per year
    plt.figure(figsize=(17,10))
    plt.bar(list(ayears), [len(ayears[year]) for year in ayears])
    plt.title('Number of abstracts per year')
In [11]:
#gen_plots(ayears)
In [12]:
def calc_lratios(ayears):
    # Calculate log-likelihood differences between an abstract's own year and neighbouring years
    nextyear = dict( (y, list(ayears)[i+1]) for i, y in enumerate(list(ayears)[:-1]) )
    prevyear = dict( (y, list(ayears)[i-1]) for i, y in enumerate(list(ayears)[1:]) )
    nextyears = dict( (y, list(ayears)[i+1:]) for i, y in enumerate(list(ayears)[:-1]) )
    prevyears = dict( (y, list(ayears)[i-1:]) for i, y in enumerate(list(ayears)[1:]) )

    xi=li
    def lrsavg(ayears, year, target):
        return ayears[year].loc[ :, xi(target) ].sub(
            ayears[year].loc[ :, xi(target) ].mean(axis=0), axis=1 if isinstance(target, list) else 0 )
    
    for year in list(ayears):
        ayears[year]['lrsavg'] = lrsavg(ayears, year, year)
    for year in list(ayears)[:-1]:
        ayears[year]['lrnext'] = ayears[year].loc[ :, xi(nextyear[year]) ] - ayears[year].loc[ :, xi(year) ]
        ayears[year]['lrsavgnext'] = lrsavg(ayears, year, nextyear[year]).sub(lrsavg(ayears, year, year),axis=0)
        ayears[year]['lrnexts'] = ayears[year].loc[ :, xi(nextyears[year]) ].sub(
                                    ayears[year].loc[ :, xi(year) ], axis=0 ).sum(axis=1)
        ayears[year]['lrsavgnexts'] = lrsavg(ayears, year, nextyears[year]).sub(
                                    lrsavg(ayears, year, year), axis=0 ).sum(axis=1)
        ayears[year]['lrsavgnextmax'] = lrsavg(ayears, year, nextyears[year]).sub(
                                    lrsavg(ayears, year, year), axis=0 ).max(axis=1)
    for year in list(ayears)[1:]:
        ayears[year]['lrprev'] = ayears[year].loc[ :, xi(year) ] - ayears[year].loc[ :, xi(prevyear[year]) ]
    for year in list(ayears)[1:-1]:
        ayears[year]['lrjump'] = ayears[year].loc[ :, xi(nextyear[year]) ] - \
                                 ayears[year].loc[ :, xi(prevyear[year]) ]

def find_lratios(ayears, year, ratio='lrjump'):
    # Find the 10 largest ratios for a given year
    return ayears[year][ratio].sort_values(ascending=False)[:10]

def print_lratios(ayears):
    for year in list(ayears)[1:-1]:
        print(year, '\n', find_lratios(ayears, year), '\n')
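In log space these "ratios" are differences of log-likelihoods. For an abstract from year y, lrjump compares the following year's model to the previous year's, so a large positive value flags an abstract that reads more like the literature to come than like the literature just past. A small worked example with made-up numbers:

# Made-up log-likelihoods for one 2005 abstract:
#   l2004 = -1203.0,  l2005 = -1201.0,  l2006 = -1196.5
#   lrnext = l2006 - l2005 = 4.5
#   lrprev = l2005 - l2004 = 2.0
#   lrjump = l2006 - l2004 = 6.5   # > 0: closer to the following year's model
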
In [13]:
calc_lratios(ayears)
In [78]:
def print_lcorr(ayears, ratio='lrjump'):
    for year in list(ayears)[1:-1]:
        print('\n\n', year, '\n')
        cyears = ayears[year][[ratio,'citations']]
        print('\n\npearson\n', cyears.corr())
        print('\n\nkendall\n', cyears.corr('kendall'))
        print('\n\nspearman\n', cyears.corr('spearman'))

def plot_lcorr(ayears, ratio='lrjump'):
    years = list(ayears)[-11:-1]
    plt.figure(figsize=(17, len(years)*3))
    for i, target in enumerate( years ):
        ayear = ayears[target]
        ax = plt.subplot(int(len(years)/2)+1, 2, i+1)
        ax.set_title(ratio+' '+str(target))
        ax.scatter(ayear['citations'], ayear[ratio])
        bins = numpy.logspace( numpy.log10(ayear.citations.min()+1),
                               numpy.log10(ayear.citations.max()+1.2) ) - 1.1
        ayear['citations-bin']=pandas.cut(ayear.citations, bins, labels=bins[1:])
        r=ayear[['citations-bin', ratio]].groupby('citations-bin').mean()[ratio]
        ax.plot(numpy.array(r.index), r, color='red', linewidth=2)
        ax.axhline()
        ax.set_xscale('log')
        plt.xlim(xmin=0.8)
    plt.tight_layout()
In [79]:
plot_lcorr(ayears, 'lrsavgnext')
plot_lcorr(ayears, 'lrsavgnexts')
plot_lcorr(ayears, 'lrsavgnextmax')
In [82]:
def pick_lcorr(ayears, year, ratio='lrsavgnexts'):
    ayear = ayears[year]
    low=ayear.sort_values(ratio)[:10]
    high=ayear.sort_values(ratio)[-10:]
#    print('Low {}:\n'.format(ratio), low.pmids)
#    print('High {}:\n'.format(ratio), high.pmids)
    mixed = numpy.array(pandas.concat([high.pmids, low.pmids]))
    numpy.random.shuffle(mixed)
    print('Mixed {}:\n'.format(ratio), mixed)
    items = ayear.loc[ [x in mixed for x in ayear.pmids] , ['pmids','titles','originals'] ]
    for i,p,t,o in items.itertuples():
        print(p,'\n')
        print(t,'\n')
        print(o[0],'\n\n')

pick_lcorr(ayears, 2012, 'lrsavgnexts')
Mixed lrsavgnexts:
 ['22755704' '23231306' '22226178' '22293080' '21839055' '23170291'
 '22673183' '23076930' '22992387' '22919559' '22302033' '22363415'
 '22647525' '23052257' '22320800' '22493417' '22578566' '22534285'
 '22621279' '23425423']
21839055 

Curcumin-loaded γ-cyclodextrin liposomal nanoparticles as delivery vehicles for osteosarcoma. 

the 


23170291 

Treatment outcome in patients with triple negative early stage breast cancers compared with other molecular subtypes. 

purpose 


22919559 

CAD May Not be Necessary for Microcalcifications in the Digital era, CAD May Benefit Radiologists for Masses. 

objective 


23052257 

Real-time pathology to guide breast surgery: seeing alone is not believing. 

tissue 


22534285 

Coexistence of benign phyllodes tumor and invasive ductal carcinoma in distinct breasts: case report. 

this 


23425423 

Needles in a haystack: finding recurrent genomic changes in breast cancer. 

significant 


22293080 

Worldwide variations in EGFR somatic mutations: a challenge for personalized medicine. 

two 


23231306 

Implementation and evaluation of an expectation maximization reconstruction algorithm for gamma emission breast tomosynthesis. 

purpose 


22755704 

Effect of image quality on calcification detection in digital mammography. 

purpose 


23076930 

Local anaesthetics and regional anaesthesia for preventing chronic pain after surgery. 

background 


22320800 

Adaptation of a clustered lumpy background model for task-based image quality assessment in x-ray phase-contrast mammography. 

purpose 


22992387 

NOTCH1 inhibition in vivo results in mammary tumor regression and reduced mammary tumorsphere-forming activity in vitro. 

introduction 


22363415 

Differential expression profile and genetic variants of microRNAs sequences in breast cancer patients. 

the 


22226178 

Pattern of distant recurrence according to the molecular subtypes in Korean women with breast cancer. 

background 


22647525 

A gene-protein assay for human epidermal growth factor receptor 2 (HER2): brightfield tricolor visualization of HER2 protein, the HER2 gene, and chromosome 17 centromere (CEN17) in formalin-fixed, paraffin-embedded breast cancer tissue sections. 

background 


22621279 

Die and let live: harnessing BikDD to combat breast cancer stem cells. 

one 


22673183 

Invasive breast cancer induces laminin-332 upregulation and integrin β4 neoexpression in myofibroblasts to confer an anoikis-resistant phenotype during tissue remodeling. 

introduction 


22493417 

Quality of breast cancer care: perception versus practice. 

purpose 


22578566 

Oncosuppressive role of p53-induced miR-205 in triple negative breast cancer. 

an 


22302033 

Multiple roles of cyclin-dependent kinase 4/6 inhibitors in cancer therapy. 

background 


In [17]:
# Some inspection functions

def readindex(year, index):
    # Print the abstract (as cleaned tokens) with the given index for the given year
    print( ' '.join(ayears[year]['abstracts'][index]) )

def wordinfo(word):
    # Print count and similar words for a given word
    for year in ayears:
        m=yearmodels[year]
        print(year)
        print('Count: ', m.vocab[word].count)
        print(m.most_similar(word, topn=3))

# Check if some abstracts are too small or too big
def list_abstracts_with_length(ayears, low=100, high=sys.maxsize):
    for year in ayears:
        selected=[]
        for i, item in enumerate(ayears[year]['abstracts']):
            if len(item) < low or len(item)>high:
                selected.append((i,item))
        print('len', year, [ len(x[1]) for x in selected])
        print('idx', year, [ x[0] for x in selected])
        print()

def wordstuff():
    # Creates a list of words sorted by their total count
    words = [ w for w, c in sorted(basemodel.vocab.items(), key=lambda x: x[1], reverse=True) ]

    # Log plot of the word count in the order set above
    plt.bar( [i for i in range(len(words))], numpy.log([basemodel.vocab[w].count for w in words]) )

    # Prints some words with extreme counts:
    for w in words[:10]+words[-10:]:
        print(w, basemodel.vocab[w].count)
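Typical calls for the inspection helpers above (the word and indices are arbitrary examples, not outputs of this run):

# readindex(2005, 0)                       # print the first cleaned 2005 abstract
# wordinfo('tamoxifen')                    # per-year count and nearest neighbours of a word
# list_abstracts_with_length(ayears, 50)   # flag abstracts with fewer than 50 tokens
# wordstuff()                              # plot the (log) word-count distribution of the base model
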
In [ ]: