# tethne.writers.corpora
"""
Writers that serialize a corpus into the input formats expected by
external topic-modeling tools (MALLET and DTM).
"""
from collections import Counter
def to_documents(target, ngrams, metadata=None, vocab=None):
    """
    Write a MALLET-style plain-text documents file, and optionally a
    tab-separated metadata file.

    Parameters
    ----------
    target : str
        Target path for documents; e.g. './mycorpus' will result in
        './mycorpus_docs.txt' and './mycorpus_meta.csv'.
    ngrams : dict
        Keys are paper identifiers, values are lists of (ngram, frequency)
        tuples. If `vocab` is provided, assumes that `ngram` is an index into
        `vocab`.
    metadata : tuple
        (`keys`, dict): `keys` is a list of metadata keys, and dict contains
        metadata values dict for each paper. ( [ str ], { str(p) : dict } )
    vocab : dict
        (optional) Maps ngram indices used in `ngrams` onto word strings.

    Returns
    -------
    (docpath, metapath) : tuple of str
        Paths of the output files. The metadata file is only actually
        written when `metadata` is provided.

    Raises
    ------
    IOError
        If `target` cannot be opened for writing.
    ValueError
        If `ngrams` is not dict-like.
    """
    docpath = target + '_docs.txt'
    metapath = target + '_meta.csv'

    # Fail fast with the documented ValueError instead of discovering the
    # problem mid-write via an AttributeError on .items().
    if not hasattr(ngrams, 'items'):
        raise ValueError("Parameter 'ngrams' must be a dict.")

    # MALLET expects word strings; if `vocab` is provided, the grams in
    # `ngrams` are indices that must be translated through `vocab`.
    if vocab is None:
        def word(s):
            return s
    else:
        def word(s):
            return vocab[s]

    try:
        docFile = open(docpath, 'w')
    except IOError:
        raise IOError('Invalid target. Could not open files for writing.')

    # `with` guarantees both handles are closed even if writing fails
    # part-way (the original leaked open handles on error paths).
    with docFile:
        if metadata is not None:
            metakeys, metadict = metadata
            metaFile = open(metapath, 'w')
            metaFile.write('{0}\n'.format('\t'.join(['id'] + metakeys)))
        try:
            for p, grams in ngrams.items():
                # One document per line: id and language first, then each
                # word repeated `freq` times.
                prefix = [str(p), 'en']
                words = [word(gram) for gram, freq in grams
                         for _ in range(freq)]
                docFile.write(' '.join(prefix + words) + '\n')

                if metadata is not None:
                    meta = [str(p)] + [str(metadict[p][f]) for f in metakeys]
                    metaFile.write('\t'.join(meta) + '\n')
        finally:
            if metadata is not None:
                metaFile.close()

    return docpath, metapath
def to_dtm_input(target, D, feature='unigrams', fields=('date', 'atitle')):
    """
    Write a corpus in the input format expected by DTM (dynamic topic
    modeling).

    Generates four files:

    * ``<target>-mult.dat`` -- one document per line, of the form
      ``unique_word_count index1:count1 index2:count2 ...``, in date order.
    * ``<target>-meta.dat`` -- metadata for each document, arranged in the
      same order as the documents in the -mult.dat file.
    * ``<target>-seq.dat`` -- the number of time slices, followed by the
      number of documents in each slice.
    * ``<target>-vocab.dat`` -- the vocabulary, one word per line, arranged
      in the same order as the word indices.

    Parameters
    ----------
    target : str
        Target path for documents; e.g. './mycorpus' will result in
        './mycorpus-mult.dat', './mycorpus-seq.dat', './mycorpus-vocab.dat',
        and './mycorpus-meta.dat'.
    D : :class:`.Corpus`
        Contains :class:`.Paper` objects generated from the same DfR dataset
        as t_ngrams, indexed by doi and sliced by date.
    feature : str
        (default: 'unigrams') Features in :class:`.Corpus` to use for
        modeling.
    fields : sequence of str
        (optional) Fields in :class:`.Paper` to include in the metadata
        file. A tuple default avoids the shared-mutable-default pitfall.

    Returns
    -------
    None : If all goes well.

    Raises
    ------
    IOError
        If `target` cannot be opened for writing.
    """
    fields = list(fields)  # copy so a caller's list is never aliased

    vocab = D.features[feature]['index']
    features = D.features[feature]['features']
    seq = {}

    try:
        metaFile = open(target + '-meta.dat', 'w')
    except IOError:
        raise IOError('Invalid target. Could not open files for writing.')

    # The original opened -meta.dat twice and leaked the first handle;
    # here a single handle is managed by `with`.
    with metaFile, open(target + '-mult.dat', 'w') as multFile:
        metaFile.write('\t'.join(['id'] + fields) + '\n')
        for year in D.axes['date']:
            papers = D.axes['date'][year]
            seq[year] = []
            for pid in papers:
                # Gather everything *before* writing, so a paper with
                # missing data is skipped from BOTH files; DTM requires
                # -mult.dat and -meta.dat to share the same ordering.
                try:
                    grams = features[pid]
                    meta = [str(pid)] + [str(D.papers[pid][f])
                                         for f in fields]
                except KeyError:  # May not have data for each Paper.
                    continue
                seq[year].append(pid)
                wordcount = len(grams)  # Number of unique words.
                mdat = ['{0}:{1}'.format(g, c) for g, c in grams]
                multFile.write(' '.join([str(wordcount)] + mdat) + '\n')
                metaFile.write('\t'.join(meta) + '\n')

    # -seq.dat: number of time slices, then the document count per slice.
    with open(target + '-seq.dat', 'w') as seqFile:
        seqFile.write(str(len(seq)) + '\n')
        for year, papers in sorted(seq.items()):
            seqFile.write('{0}\n'.format(len(papers)))

    # -vocab.dat: words arranged in the same order as their indices.
    with open(target + '-vocab.dat', 'w') as vocabFile:
        for index, token in sorted(vocab.items()):
            vocabFile.write('{0}\n'.format(token))

    return None