Source code for tethne.writers.corpus

"""
Writers that export the features of a corpus to plain-text files.

``write_documents`` produces a tab-separated documents file and a CSV metadata
file; ``write_documents_dtm`` produces the ``-mult.dat``, ``-seq.dat``,
``-vocab.dat`` and ``-meta.dat`` files.
"""

from collections import Counter
from itertools import repeat
import codecs
import os
import csv

from tethne import FeatureSet, StructuredFeatureSet

import sys
# Python 3 has no ``unicode`` builtin, so alias the name to ``str`` there.
PYTHON_3 = sys.version_info[0] == 3
if PYTHON_3:
    unicode = str


def write_documents(corpus, target, featureset_name, metadata_fields=[]):
    """
    Write a featureset to a tab-separated documents file and a CSV metadata
    file.

    Parameters
    ----------
    corpus : object
        Corpus-like object. Must provide ``features`` (mapping of featureset
        name to a featureset with a ``features`` mapping), ``indexed_papers``
        (mapping of identifier to paper) and ``index_by`` (name of the
        identifier field, used as the first metadata column header).
    target : str
        Path prefix for the output files; ``'_docs.txt'`` and ``'_meta.csv'``
        are appended to it.
    featureset_name : str
        Key of the featureset in ``corpus.features`` to write.
    metadata_fields : list
        (optional) Names of paper attributes to include in the metadata file.
        A paper lacking an attribute gets an empty cell. The default list is
        only read, never mutated.

    Returns
    -------
    docpath : str
        Path of the documents file. Each line holds the paper identifier,
        the language code ``en`` and the paper's features, separated by tabs.
        Papers without an entry in the featureset are skipped.
    metapath : str
        Path of the metadata CSV file (one row per indexed paper).

    Raises
    ------
    IOError
        If the documents file cannot be opened for writing.
    """

    docpath = target + '_docs.txt'
    metapath = target + '_meta.csv'

    featureset = corpus.features[featureset_name]
    features = featureset.features
    ftype = type(featureset)

    # Fail early, before anything is written, if the target is not writable.
    # The probe handle is closed immediately so that it does not leak.
    try:
        with open(docpath, 'wb'):
            pass
    except IOError:
        raise IOError('Invalid target. Could not open files for writing.')

    # Generate metadata.
    with codecs.open(metapath, 'w', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow([corpus.index_by] + list(metadata_fields))
        # ``items()`` (rather than ``iteritems()``) works on Python 2 and 3.
        for i, p in corpus.indexed_papers.items():
            values = [getattr(p, m, None) for m in metadata_fields]
            writer.writerow([i] + values)

    # Write documents content.
    with codecs.open(docpath, 'w', encoding='utf-8') as f:
        for i, _ in corpus.indexed_papers.items():
            if i not in features:
                continue
            row = [i, u'en']
            if ftype is FeatureSet:
                # One column per feature: the feature repeated ``c`` times.
                row += [u' '.join(repeat(e, c)) for e, c in features[i]]
            elif ftype is StructuredFeatureSet:
                # Structured features are written as-is, one per column.
                row += features[i]
            f.write(u'\t'.join(row) + u'\n')

    return docpath, metapath


def write_documents_dtm(corpus, target, featureset_name, slice_kwargs={},
                        metadata_fields=['date','title']):
    """
    Write a featureset as input files for a dynamic topic model (DTM).

    Parameters
    ----------
    corpus : object
        Corpus-like object. Must provide ``features`` (mapping of featureset
        name to a featureset with ``lookup``, ``index`` and ``features``
        attributes), ``index_by`` and a ``slice`` method that yields
        ``(date, subcorpus)`` pairs, where each subcorpus has ``papers`` and
        ``index_by``.
    target : str
        Target path for documents; e.g. './mycorpus' will result in
        './mycorpus-mult.dat', './mycorpus-seq.dat', './mycorpus-vocab.dat',
        and './mycorpus-meta.dat'.
    featureset_name : str
        Key of the featureset in ``corpus.features`` to write.
    slice_kwargs : dict
        (optional) Keyword arguments passed to ``corpus.slice``. The default
        dict is only read, never mutated.
    metadata_fields : list
        (optional) Names of paper attributes to include in the metadata file.
        A paper lacking an attribute gets an empty cell.

    Returns
    -------
    None : If all goes well.

    Raises
    ------
    IOError
    """

    metapath = target + '-meta.dat'
    multpath = target + '-mult.dat'
    seqpath = target + '-seq.dat'
    vpath = target + '-vocab.dat'

    featureset = corpus.features[featureset_name]
    lookup = featureset.lookup
    index = featureset.index
    features = featureset.features

    # Generate -mult.dat file (wordcounts for each document).
    #   From the DTM example:
    #
    #     one-doc-per-line, each line of the form
    #         unique_word_count index1:count1 index2:count2 ... indexn:countn
    #     The docs in foo-mult.dat should be ordered by date, with the first
    #     docs from time1, the next from time2, ..., and the last docs from
    #     timen.
    #
    # And -meta.dat file (with DOIs).
    #
    #       a file with information on each of the documents, arranged in
    #           the same order as the docs in the mult file.
    #
    # Both files are opened once, outside the slice loop: reopening the mult
    # file in 'w' mode per slice would truncate the documents of all earlier
    # slices. Writing both in a single pass also keeps their rows aligned.
    N = Counter()   # Number of documents per time slice.
    with codecs.open(multpath, 'w', encoding='utf-8') as mult_file, \
            codecs.open(metapath, 'w', encoding='utf-8') as meta_file:
        writer = csv.writer(meta_file, delimiter='\t')
        writer.writerow(['id'] + list(metadata_fields))
        for date, subcorpus in corpus.slice(**slice_kwargs):
            for p in subcorpus.papers:
                i = getattr(p, subcorpus.index_by)
                N[date] += 1
                doc_line = [u'{0}:{1}'.format(lookup[e], c)
                            for e, c in features[i]]
                unique = u'{0}'.format(len(features[i]))
                mult_file.write(u' '.join([unique] + doc_line) + u'\n')

                field_data = [getattr(p, m, None) for m in metadata_fields]
                writer.writerow([getattr(p, corpus.index_by)] + field_data)

    # Generate -seq.dat file (number of papers per year).
    #   From the DTM example:
    #
    #       Number_Timestamps
    #       number_docs_time_1
    #       ...
    #       number_docs_time_i
    #       ...
    #       number_docs_time_NumberTimestamps
    #
    with open(seqpath, 'w') as f:
        # First line is the number of time slices, as the format requires.
        f.write(u'{count}\n'.format(count=len(N)))
        for date in sorted(N):
            f.write(u'{date}\n'.format(date=N[date]))

    #       a file with all of the words in the vocabulary, arranged in
    #       the same order as the word indices
    with codecs.open(vpath, 'w', encoding='utf-8') as f:
        f.write(u'\n'.join([index[i] for i in sorted(index)]))