
Source code for tethne.writers.corpus

"""
"""

from collections import Counter
from itertools import repeat
import codecs
import os
import csv

from tethne import FeatureSet, StructuredFeatureSet

import sys
PYTHON_3 = sys.version_info[0] == 3
if PYTHON_3:
    unicode = str


def write_documents(corpus, target, featureset_name, metadata_fields=[]):
    """
    Write documents and metadata for a featureset to disk.

    Parameters
    ----------
    corpus : :class:`.Corpus`
    target : str
        Base path for output; e.g. './mycorpus' will result in
        './mycorpus_docs.txt' and './mycorpus_meta.csv'.
    featureset_name : str
        Name of a :class:`.FeatureSet` or :class:`.StructuredFeatureSet`
        in ``corpus.features``.
    metadata_fields : list (optional)
        Fields in :class:`.Paper` to include in the metadata file.

    Returns
    -------
    docpath : str
    metapath : str

    Raises
    ------
    IOError
    """
    docpath = target + '_docs.txt'
    metapath = target + '_meta.csv'

    features = corpus.features[featureset_name].features
    ftype = type(corpus.features[featureset_name])
    index = corpus.features[featureset_name].index

    # Verify that the target is writable before generating any output.
    try:
        with open(docpath, 'wb'):
            pass
    except IOError:
        raise IOError('Invalid target. Could not open files for writing.')

    # Generate metadata. ``items()`` works in both Python 2 and 3
    # (``iteritems`` is Python 2 only).
    with codecs.open(metapath, 'w', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow([corpus.index_by] + list(metadata_fields))
        for i, p in corpus.indexed_papers.items():
            getter = lambda m: getattr(p, m) if hasattr(p, m) else None
            writer.writerow([i] + list(map(getter, metadata_fields)))

    # Write document content.
    with codecs.open(docpath, 'w', encoding='utf-8') as f:
        for i, p in corpus.indexed_papers.items():
            if i in features:
                row = [i, u'en']
                if ftype is FeatureSet:
                    # Expand (token, count) pairs into repeated tokens.
                    row += [u' '.join(repeat(e, c)) for e, c in features[i]]
                elif ftype is StructuredFeatureSet:
                    row += features[i]
                f.write(u'\t'.join(row) + u'\n')

    return docpath, metapath
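
For reference, a minimal usage sketch. The dataset path and the
'wordcounts' featureset name are illustrative assumptions (a corpus read
with ``tethne.readers.dfr`` typically carries a wordcounts featureset),
not guarantees of this module:

>>> from tethne.readers import dfr
>>> from tethne.writers.corpus import write_documents
>>> corpus = dfr.read('/path/to/dfr/dataset')    # hypothetical path
>>> docpath, metapath = write_documents(corpus, '/tmp/mycorpus', 'wordcounts',
...                                     metadata_fields=['date', 'title'])
>>> docpath, metapath
('/tmp/mycorpus_docs.txt', '/tmp/mycorpus_meta.csv')
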
def write_documents_dtm(corpus, target, featureset_name, slice_kwargs={},
                        metadata_fields=['date', 'title']):
    """
    Write a corpus in the input format expected by DTM (Dynamic Topic Model).

    Parameters
    ----------
    corpus : :class:`.Corpus`
        Contains :class:`.Paper` objects indexed by ``corpus.index_by`` and
        sliceable by date.
    target : str
        Target path for documents; e.g. './mycorpus' will result in
        './mycorpus-mult.dat', './mycorpus-seq.dat', './mycorpus-vocab.dat',
        and './mycorpus-meta.dat'.
    featureset_name : str
        Name of the featureset in ``corpus.features`` to use for modeling.
    slice_kwargs : dict (optional)
        Keyword arguments passed to ``corpus.slice()``.
    metadata_fields : list (optional)
        Fields in :class:`.Paper` to include in the metadata file.

    Returns
    -------
    None : If all goes well.

    Raises
    ------
    IOError
    """
    metapath = target + '-meta.dat'
    multpath = target + '-mult.dat'
    seqpath = target + '-seq.dat'
    vpath = target + '-vocab.dat'

    lookup = corpus.features[featureset_name].lookup
    index = corpus.features[featureset_name].index
    features = corpus.features[featureset_name].features

    # Generate the -mult.dat file (wordcounts for each document).
    # From the DTM example:
    #
    #   one-doc-per-line, each line of the form
    #       unique_word_count index1:count1 index2:count2 ... indexn:countn
    #   The docs in foo-mult.dat should be ordered by date, with the first
    #   docs from time1, the next from time2, ..., and the last docs from
    #   timen.
    #
    N = Counter()
    # Open the file once, so that each slice appends to the same output
    # rather than overwriting the previous one.
    with codecs.open(multpath, 'w', encoding='utf-8') as f:
        for date, subcorpus in corpus.slice(**slice_kwargs):
            for p in subcorpus.papers:
                i = getattr(p, subcorpus.index_by)
                N[date] += 1
                docLine = [u':'.join([unicode(lookup[e]), unicode(c)])
                           for e, c in features[i]]
                unique = unicode(len(features[i]))
                f.write(u' '.join([unique] + docLine) + '\n')

    # And the -meta.dat file (with DOIs).
    #
    #   a file with information on each of the documents, arranged in
    #   the same order as the docs in the mult file.
    #
    with codecs.open(metapath, 'w', encoding='utf-8') as f:
        writer = csv.writer(f, delimiter='\t')
        writer.writerow(['id'] + list(metadata_fields))
        for date, subcorpus in corpus.slice(**slice_kwargs):
            for p in subcorpus.papers:
                getter = lambda m: getattr(p, m) if hasattr(p, m) else None
                fieldData = map(getter, metadata_fields)
                writer.writerow([getattr(p, corpus.index_by)]
                                + list(fieldData))

    # Generate the -seq.dat file (number of papers per year).
    # From the DTM example:
    #
    #   Number_Timestamps
    #   number_docs_time_1
    #   ...
    #   number_docs_time_i
    #   ...
    #   number_docs_time_NumberTimestamps
    #
    with codecs.open(seqpath, 'w', encoding='utf-8') as f:
        f.write(u'{0}\n'.format(len(N)))    # Number of timestamps.
        for date in sorted(N.keys()):
            f.write(u'{0}\n'.format(N[date]))

    # The -vocab.dat file lists all of the words in the vocabulary, arranged
    # in the same order as the word indices.
    with codecs.open(vpath, 'w', encoding='utf-8') as f:
        f.write(u'\n'.join([index[i] for i in sorted(index.keys())]))
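
A hedged usage sketch for the DTM writer; as above, the dataset path and
featureset name are assumptions for illustration, and the default slicing
behaviour of ``corpus.slice()`` is used:

>>> from tethne.readers import dfr
>>> from tethne.writers.corpus import write_documents_dtm
>>> corpus = dfr.read('/path/to/dfr/dataset')    # hypothetical path
>>> write_documents_dtm(corpus, '/tmp/mycorpus', 'wordcounts',
...                     metadata_fields=['date', 'title'])

This writes '/tmp/mycorpus-mult.dat', '/tmp/mycorpus-seq.dat',
'/tmp/mycorpus-vocab.dat', and '/tmp/mycorpus-meta.dat', in the input
layout described in the comments above.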