# tethne.writers.corpora
"""
Writers that serialize a corpus into the input formats expected by
external topic-modeling tools (MALLET and DTM).
"""
from collections import Counter
def to_documents(target, ngrams, metadata=None, vocab=None):
    """
    Write a MALLET-style plain-text documents file, and optionally a
    tab-separated metadata file.

    Parameters
    ----------
    target : str
        Target path for documents; e.g. './mycorpus' will result in
        './mycorpus_docs.txt' and './mycorpus_meta.csv'.
    ngrams : dict
        Keys are paper identifiers, values are lists of (ngram, frequency)
        tuples. If `vocab` is provided, assumes that `ngram` is an index into
        `vocab`.
    metadata : tuple
        (`keys`, dict): `keys` is a list of metadata keys, and dict contains
        metadata values dict for each paper. ( [ str ], { str(p) : dict } )
    vocab : dict
        (optional) Maps ngram indices used in `ngrams` onto word strings.

    Returns
    -------
    (docpath, metapath) : tuple of str
        Paths of the output files. The metadata file is only actually
        written when `metadata` is provided.

    Raises
    ------
    IOError
        If `target` cannot be opened for writing.
    ValueError
        If `ngrams` is not dict-like.
    """
    docpath = target + '_docs.txt'
    metapath = target + '_meta.csv'

    # Fail fast with the documented ValueError instead of discovering the
    # problem mid-write via an AttributeError on .items().
    if not hasattr(ngrams, 'items'):
        raise ValueError("Parameter 'ngrams' must be a dict.")

    # MALLET expects word strings; if `vocab` is provided, the grams in
    # `ngrams` are indices that must be translated through `vocab`.
    if vocab is None:
        def word(s):
            return s
    else:
        def word(s):
            return vocab[s]

    try:
        docFile = open(docpath, 'w')
    except IOError:
        raise IOError('Invalid target. Could not open files for writing.')

    # `with` guarantees both handles are closed even if writing fails
    # part-way (the original leaked open handles on error paths).
    with docFile:
        if metadata is not None:
            metakeys, metadict = metadata
            metaFile = open(metapath, 'w')
            metaFile.write('{0}\n'.format('\t'.join(['id'] + metakeys)))
        try:
            for p, grams in ngrams.items():
                # One document per line: id and language first, then each
                # word repeated `freq` times.
                prefix = [str(p), 'en']
                words = [word(gram) for gram, freq in grams
                         for _ in range(freq)]
                docFile.write(' '.join(prefix + words) + '\n')

                if metadata is not None:
                    meta = [str(p)] + [str(metadict[p][f]) for f in metakeys]
                    metaFile.write('\t'.join(meta) + '\n')
        finally:
            if metadata is not None:
                metaFile.close()

    return docpath, metapath
def to_dtm_input(target, D, feature='unigrams', fields=('date', 'atitle')):
    """
    Write a corpus in the input format expected by DTM (dynamic topic
    modeling).

    Generates four files:

    * ``<target>-mult.dat`` -- one document per line, of the form
      ``unique_word_count index1:count1 index2:count2 ...``, in date order.
    * ``<target>-meta.dat`` -- metadata for each document, arranged in the
      same order as the documents in the -mult.dat file.
    * ``<target>-seq.dat`` -- the number of time slices, followed by the
      number of documents in each slice.
    * ``<target>-vocab.dat`` -- the vocabulary, one word per line, arranged
      in the same order as the word indices.

    Parameters
    ----------
    target : str
        Target path for documents; e.g. './mycorpus' will result in
        './mycorpus-mult.dat', './mycorpus-seq.dat', './mycorpus-vocab.dat',
        and './mycorpus-meta.dat'.
    D : :class:`.Corpus`
        Contains :class:`.Paper` objects generated from the same DfR dataset
        as t_ngrams, indexed by doi and sliced by date.
    feature : str
        (default: 'unigrams') Features in :class:`.Corpus` to use for
        modeling.
    fields : sequence of str
        (optional) Fields in :class:`.Paper` to include in the metadata
        file. A tuple default avoids the shared-mutable-default pitfall.

    Returns
    -------
    None : If all goes well.

    Raises
    ------
    IOError
        If `target` cannot be opened for writing.
    """
    fields = list(fields)  # copy so a caller's list is never aliased

    vocab = D.features[feature]['index']
    features = D.features[feature]['features']
    seq = {}

    try:
        metaFile = open(target + '-meta.dat', 'w')
    except IOError:
        raise IOError('Invalid target. Could not open files for writing.')

    # The original opened -meta.dat twice and leaked the first handle;
    # here a single handle is managed by `with`.
    with metaFile, open(target + '-mult.dat', 'w') as multFile:
        metaFile.write('\t'.join(['id'] + fields) + '\n')
        for year in D.axes['date']:
            papers = D.axes['date'][year]
            seq[year] = []
            for pid in papers:
                # Gather everything *before* writing, so a paper with
                # missing data is skipped from BOTH files; DTM requires
                # -mult.dat and -meta.dat to share the same ordering.
                try:
                    grams = features[pid]
                    meta = [str(pid)] + [str(D.papers[pid][f])
                                         for f in fields]
                except KeyError:  # May not have data for each Paper.
                    continue
                seq[year].append(pid)
                wordcount = len(grams)  # Number of unique words.
                mdat = ['{0}:{1}'.format(g, c) for g, c in grams]
                multFile.write(' '.join([str(wordcount)] + mdat) + '\n')
                metaFile.write('\t'.join(meta) + '\n')

    # -seq.dat: number of time slices, then the document count per slice.
    with open(target + '-seq.dat', 'w') as seqFile:
        seqFile.write(str(len(seq)) + '\n')
        for year, papers in sorted(seq.items()):
            seqFile.write('{0}\n'.format(len(papers)))

    # -vocab.dat: words arranged in the same order as their indices.
    with open(target + '-vocab.dat', 'w') as vocabFile:
        for index, token in sorted(vocab.items()):
            vocabFile.write('{0}\n'.format(token))

    return None