https://gitlab.com/solstag/oncologister-pvt
Contributions are welcome; get in touch with the author(s).
# This notebook requires pandas, numpy, gensim, lxml and matplotlib
import re, multiprocessing, numpy, json, os, sys, math
import pandas, gensim
from collections import defaultdict, namedtuple, OrderedDict
from copy import deepcopy
import matplotlib.pyplot as plt
from lxml import html
%matplotlib inline
# Options used to run the notebook
opt_your_initials = 'aha' # SET THIS BEFORE RUNNING, e.g. 'aha', 'jpc', ...
opt_data_source = 'getpapersdir' # one of 'esmo', 'getpaperssingle', 'getpapersdir'
opt_training_mode = 'randomfill' # one of 'nothing', 'randomfill', 'randomsample'
opt_model_iter = 5
opt_name_extra = ''
opt_model_name = '-'.join( [opt_your_initials, opt_data_source, opt_training_mode, str(opt_model_iter)] )
opt_model_name += '-' + opt_name_extra if opt_name_extra else ''
opt_load_models = True
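# With the options above, opt_model_name resolves to 'aha-getpapersdir-randomfill-5';
# a non-empty opt_name_extra would append a final '-<extra>' component.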
# Source abstracts from JSON files generated by esmo.py
def get_esmo(ayears, esmo_path):
# get converted PDF files
for year in os.listdir(esmo_path):
year_path = os.path.join(esmo_path,year,'xml')
for sec in os.listdir(year_path):
if sec.endswith('.json'):
sec_path = os.path.join(year_path,sec)
with open(sec_path) as sec_file:
content = json.load(sec_file)
ayears[int(year)].abstracts.extend( [ l['text'] for l in item['abstract'] ]
for item in content if item['separator']!=None )
# get converted HTML files
for year in os.listdir(esmo_path):
year_path = os.path.join(esmo_path,year,'content')
for sec in os.listdir(year_path):
if sec.endswith('.json'):
sec_path = os.path.join(year_path,sec)
with open(sec_path) as sec_file:
content = json.load(sec_file)
for a in content['abstracts']:
htext = html.fromstring(a['text'])
htext.find('h2').getparent().remove(htext.find('h2'))
abstract = [ a['title'],
' '.join(a['authors'].keys()),
' '.join(sum(a['authors'].values(),[])),
htext.text_content() ]
ayears[int(year)].abstracts.append( abstract )
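# Directory and JSON layout that get_esmo() expects, as inferred from the code above
# (a sketch, not verbatim esmo.py output):
#   <esmo_path>/<year>/xml/*.json      - a list of items, each with a 'separator' field and
#                                        'abstract': [{'text': ...}, ...]
#   <esmo_path>/<year>/content/*.json  - {'abstracts': [{'title': ..., 'authors': {name: [...]},
#                                        'text': '<HTML whose leading <h2> is removed>'}, ...]}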
# Source abstracts from JSON files generated by ContentMine's getpapers.js
def get_getpapers(ayears, getpapers_path):
names = [n for n in os.listdir(getpapers_path) if os.path.isdir(os.path.join(getpapers_path, n))]
noabstract = []
for name in names:
with open(os.path.join(getpapers_path, name, 'eupmc_result.json')) as f:
content = json.load(f)
try:
year = int(content["journalInfo"][0]["yearOfPublication"][0])
abstract = content["abstractText"][0]
pmid = content["id"][0]
citation = int(content["citedByCount"][0])
if any( map( lambda x: len(x)!=1,
(content["abstractText"], content["id"], content["citedByCount"],
content["journalInfo"][0]["yearOfPublication"]) ) ):
raise Exception('Non-unique item!')
if not len(abstract):
raise Exception('Empty abstract!')
ayears[year].abstracts.append([abstract])
ayears[year].pmids.append(pmid)
ayears[year].citations.append(citation)
except KeyError:
noabstract.append(name)
if noabstract:
print('In <'+getpapers_path+'> missing "abstractText":', ', '.join(noabstract))
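# Minimal sketch of the fields get_getpapers() reads from each eupmc_result.json
# (values are made up; real Europe PMC records carry many more keys):
example_eupmc_record = {
    "id": ["12345678"],
    "abstractText": ["Example abstract text ..."],
    "citedByCount": ["4"],
    "journalInfo": [{"yearOfPublication": ["2015"]}],
}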
# Create the main data structure
ArticleSeries = namedtuple('Articles', ['abstracts', 'pmids', 'citations'])
ayears = defaultdict(lambda: ArticleSeries([], [], []))
# ESMO abstracts from esmo.py
if opt_data_source=='esmo':
esmo_path='../data/esmo/auto_esmo'
get_esmo(ayears, esmo_path)
# Getpapers single dir
if opt_data_source=='getpaperssingle':
getpapers_path = '/home/aedesaegypti/contentmine/papers/diarrhea_angola_orig/'
get_getpapers(ayears, getpapers_path)
# Getpapers dir store
if opt_data_source=='getpapersdir':
getpapers_store = '../data/gotpapers/'
getpapers_re = re.compile('gp-mesh_Breast_Neoplasms-research_article-en-')
getpapers_paths = [os.path.join(getpapers_store, name)
for name in os.listdir(getpapers_store)
if os.path.isdir(os.path.join(getpapers_store, name)) \
and getpapers_re.match(name)]
# Tell us where we'll look for papers
print('## Abstract paths:')
for path in getpapers_paths:
print(path)
print()
print('## Defective abstracts:')
for getpapers_path in getpapers_paths:
get_getpapers(ayears, getpapers_path)
print()
# Transform into OrderedDict of DataFrames
ayears = OrderedDict( sorted( ayears.items() ) )
for year in ayears:
ayears[year] = pandas.DataFrame( ayears[year]._asdict() )
# Tell us what we've got
print('## Abstract count for {}:'.format(opt_data_source))
for year in ayears:
print(year, len(ayears[year]))
print('Total:', sum( map( len, ayears.values() ) ), '\n')
# Split words and do some cleaning
def linestowords(document):
splitter = re.compile('[%=:;,"\'\(\)\[\]\{\}\<\>\.]+')
words = [word
for line in document
for inter in line.casefold().split()
for dirty in splitter.split(inter)
for word in [dirty.strip('.-')]
if len(word)>1 and not word.isdigit()]
return words
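# Quick illustration on a made-up line: punctuation is stripped, text is casefolded,
# and bare numbers and single-character tokens are dropped.
linestowords(['Overall survival (OS) was 12.3 months.'])
# -> ['overall', 'survival', 'os', 'was', 'months']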
for year in ayears:
ayears[year]['abstracts'] = [ linestowords(abstract) for abstract in ayears[year]['abstracts'] ]
ayears[year] = ayears[year][ ayears[year]['abstracts'].apply(bool) ]
ayears[year].index = range( len( ayears[year] ) )
# Identify and join "bigrams" so ['são', 'paulo'] becomes ['são_paulo']
bigram = gensim.models.phrases.Phrases( [ abstract
for year in ayears
for abstract in ayears[year]['abstracts'] ] )
for year in ayears:
ayears[year]['abstracts'] = [ bigram[abstract] for abstract in ayears[year]['abstracts'] ]
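# Illustration of the Phrases transform (made-up tokens; whether a pair is joined depends
# on its corpus frequency and the default min_count/threshold of Phrases):
bigram[['breast', 'cancer', 'patients']]
# -> ['breast_cancer', 'patients'] if 'breast cancer' occurs often enough, unchanged otherwise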
yearmodels=dict()
if opt_load_models:
# Load existing models from saved files
for year in ayears:
model_path = '-'.join(['w2v', opt_model_name, str(year)])
if os.path.exists( model_path ):
yearmodels[year] = gensim.models.Word2Vec.load( model_path )
missing_years = set(ayears).difference( set(yearmodels) )
if missing_years:
basemodel_path = '-'.join(['w2v', opt_model_name, 'base'])
if os.path.exists( basemodel_path ):
# Load existing basemodel
basemodel = gensim.models.Word2Vec.load( basemodel_path )
else:
# Create the base model, hs=1 and negative=0 are required by .score()
basemodel = gensim.models.Word2Vec(workers=multiprocessing.cpu_count(),
iter = opt_model_iter,
hs=1, negative=0
)
# Create a copy of the abstracts as the original is needed for analysis
model_ayears = deepcopy(ayears)
# Shuffle abstracts, because order matters in training
for year in model_ayears:
index = list(model_ayears[year].index)
numpy.random.shuffle(index)
model_ayears[year].index = index
model_ayears[year].sort_index(inplace=True)
# Balance abstracts if we chose to
if opt_training_mode == 'nothing':
pass
elif opt_training_mode == 'randomfill':
        # Repeat each year's abstracts to fill up to the largest corpus, to see if this fixes the size bias (worked example after this if/elif chain)
maxsize = max( map( len, model_ayears.values() ) )
for year in model_ayears:
size = len(model_ayears[year])
factor = int(maxsize/size)
rest = maxsize - size*factor
model_ayears[year] = pandas.concat( [ model_ayears[year] for i in range(factor) ] \
+ [ model_ayears[year][:rest] ], ignore_index=True )
elif opt_training_mode == 'randomsample':
# Cut to same size to see if this fixes the size bias
minsize = 1000 # min( map(len, model_ayears.values() )) -> 96 and 98 were strong
for year in model_ayears:
model_ayears[year] = model_ayears[year][:minsize]
else:
raise Exception('opt_training_mode not found: ' + opt_training_mode)
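    # Worked example of the randomfill arithmetic above: with maxsize=1000 and a year holding
    # size=300 abstracts, factor=3 and rest=100, so that year becomes three full copies plus
    # its first 100 abstracts: 3*300 + 100 = 1000 rows.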
# Build common vocabulary from all years
basemodel.build_vocab( pandas.concat( [model_ayears[year]['abstracts'] for year in model_ayears],
ignore_index=True ) )
# Save the basemodel
basemodel.save( basemodel_path )
print(basemodel)
# Create a model for each year and train it
for year in sorted(missing_years):
yearmodels[year] = deepcopy(basemodel)
yearmodels[year].train(model_ayears[year]['abstracts'], total_examples=len(model_ayears[year]))
yearmodels[year].save( '-'.join(['w2v', opt_model_name, str(year)]) )
def calc_probs(ayears, yearmodels):
    # score() takes a list of sentences here; it could also be a sentence generator
sentlist = [ abstract for year in ayears for abstract in ayears[year]['abstracts'] ]
    # log likelihood of each abstract under each year's word2vec model
llhd = numpy.array( [ yearmodels[year].score(sentlist, len(sentlist))
for year in ayears ] )
    # now exponentiate to get likelihoods,
    lhd = numpy.exp(llhd - llhd.max(axis=0)) # subtract each abstract's max to avoid numerical overflow (see the check after this function)
    # normalize across the year models to get abstract-year probabilities
prob = pandas.DataFrame( (lhd/lhd.sum(axis=0)).transpose(), columns=ayears )
    # finally, record each abstract's source year and position alongside its probabilities
prob['year'] = [ year for year in ayears for abstract in ayears[year]['abstracts'] ]
prob['pos'] = pandas.concat( [ ayears[year].index.to_series() for year in ayears ], ignore_index=True)
return prob
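# Why subtract the per-abstract maximum before exponentiating: it avoids overflow/underflow
# and leaves the normalized probabilities unchanged. A small check with made-up log likelihoods:
_llhd = numpy.array([[-1000.0, -800.0], [-1002.0, -805.0]])
_lhd = numpy.exp(_llhd - _llhd.max(axis=0))
print(_lhd / _lhd.sum(axis=0))  # each column is a valid probability distribution
# numpy.exp(_llhd) alone would underflow to all zeros here, making the normalization 0/0.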
def calc_probs2(ayears, yearmodels):
for year_a in ayears:
        # log likelihood of each abstract from year_a under each year's word2vec model
llhd = pandas.DataFrame( ( yearmodels[year_m].score( ayears[year_a]['abstracts'], len(ayears[year_a]) )
for year_m in ayears ),
index=yearmodels)
        # now exponentiate to get likelihoods,
        lhd = numpy.exp(llhd - llhd.max(axis=0)) # subtract each abstract's max to avoid numerical overflow
        # normalize across the year models to get abstract-year probabilities
plhd = (lhd/lhd.sum(axis=0)).T
ayears[year_a] = pandas.concat( [ayears[year_a], plhd], axis=1)
#probs = calc_probs(ayears, yearmodels)
calc_probs2(ayears, yearmodels)
ayears[1992]
def gen_plots(ayears):
probs = list(range(1991,2017))
# Each subplot is the model trained from that year crossed with the corpus for each year
for year in ayears:
ayears[year]['year']=year
allyears = pandas.concat( [ayears[year] for year in ayears] )
axs = allyears.boxplot( probs, by='year', whis=[5,95], showmeans=1, layout=(9,3),
figsize=(17,17), return_type='axes')
for ax in axs.values():
ax.set_xticklabels([x.get_text()[-2:] for x in ax.get_xticklabels()])
# Each subplot is the corpus for that year crossed with each of the models
plt.figure(figsize=(17,17))
for i, target in enumerate( ayears ):
plt.subplot(9,3,i+1, label=str(target))
ax = ayears[target].boxplot( probs, whis=[5,95], showmeans=1, return_type='axes')
ax.set_title(str(target))
ax.set_xticklabels([x.get_text()[-2:] for x in ax.get_xticklabels()])
plt.tight_layout()
# Histogram of abstracts per year
plt.figure(figsize=(17,10))
    plt.bar(list(ayears), [len(ayears[year]) for year in ayears])
plt.title('Number of abstracts per year')
gen_plots(ayears)
def calc_ratios(ayears):
    # Calculate the ratio between an abstract's likelihood in its own year and in the adjacent years (see the reading guide after this function)
nextyear = dict( (y, list(ayears)[i+1]) for i, y in enumerate(list(ayears)[:-1]) )
prevyear = dict( (y, list(ayears)[i-1]) for i, y in enumerate(list(ayears)[1:]) )
for year in list(ayears)[:-1]:
ayears[year]['rnext'] = ayears[year].loc[ :, nextyear[year] ] / ayears[year].loc[ :, year ]
for year in list(ayears)[1:]:
ayears[year]['rprev'] = ayears[year].loc[ :, year ] / ayears[year].loc[ :, prevyear[year] ]
for year in list(ayears)[1:-1]:
ayears[year]['rjump'] = ayears[year].loc[ :, nextyear[year] ] / ayears[year].loc[ :, prevyear[year] ]
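# Reading the ratios (illustrative numbers): for an abstract from 2005, rjump is its
# probability under the 2006 model divided by its probability under the 2004 model,
# so probabilities of 0.06 and 0.02 give rjump = 3, i.e. the abstract reads three times
# more like the following year than the preceding one.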
def find_ratios(ayears, year, ratio='rjump'):
# Find the 10 largest ratios for a given year
return ayears[year][ratio].sort_values(ascending=False)[:10]
def print_ratios(ayears):
for year in list(ayears)[1:-1]:
print(year, '\n', find_ratios(ayears, year), '\n')
calc_ratios(ayears)
print_ratios(ayears)
def print_corr(ayears, ratio='rjump'):
    if False: # flip to True to also print per-year correlation tables
for year in list(ayears)[1:-1]:
print('\n\n', year, '\n')
cyears = ayears[year][[ratio,'citations']]
print('\n\npearson\n', cyears.corr())
            print('\n\nkendall\n', cyears.corr('kendall'))
            print('\n\nspearman\n', cyears.corr('spearman'))
plt.figure(figsize=(17,17))
for i, target in enumerate( list(ayears)[1:-1] ):
# plt.subplot(8, 3,i+1, label=str(target))
cyears = ayears[target][[ratio,'citations']]
ax = cyears.plot( 'citations', ratio, kind='scatter', logy=True)
ax.set_title(str(target))
# ax.set_xticklabels([x.get_text()[-2:] for x in ax.get_xticklabels()])
# plt.tight_layout()
print_corr(ayears)
print_corr(ayears, 'rnext')
# Some inspection functions
def readindex(year, index):
# Reads the abstract with given index for the year given above
print( ' '.join(ayears[year]['abstracts'][index]) )
def wordinfo(word):
# Print count and similar words for a given word
for year in ayears:
m=yearmodels[year]
print(year)
print('Count: ', m.vocab[word].count)
print(m.most_similar(word, topn=3))
# Check if some abstracts are too small or too big
def list_abstracts_with_length(ayears, low=100, high=sys.maxsize):
for year in ayears:
selected=[]
for i, item in enumerate(ayears[year]['abstracts']):
if len(item) < low or len(item)>high:
selected.append((i,item))
print('len', year, [ len(x[1]) for x in selected])
print('idx', year, [ x[0] for x in selected])
print()
def wordstuff():
# Creates a list of words sorted by their total count
    words = [ w for w, c in sorted(basemodel.vocab.items(), key=lambda x: x[1].count, reverse=True) ]
# Log plot of the word count in the order set above
plt.bar( [i for i in range(len(words))], numpy.log([basemodel.vocab[w].count for w in words]) )
# Prints some words with extreme counts:
for w in words[:10]+words[-10:]:
print(w, basemodel.vocab[w].count)