Source code for tethne.persistence.hdf5.corpus

import logging
# Layout of every log record: timestamp, logger name, level, then the module,
# function and line number that issued the message, and finally the message.
_LOG_FORMAT = '%(asctime)-6s: %(name)s - %(levelname)s - %(module)s - %(funcName)s - %(lineno)d - %(message)s'

# No log file is named here (filename=None); only the record format is set.
logging.basicConfig(format=_LOG_FORMAT, filename=None)

# Logger for this module. Its threshold is INFO, so the ``logger.debug`` calls
# in this module are suppressed unless the level is lowered.
logger = logging.getLogger(__name__)
logger.setLevel('INFO')

import numpy
import tables
from ...classes import Paper, Corpus
import tempfile
import uuid
import cPickle as pickle
import urllib
from unidecode import unidecode

from util import *

class HDF5Corpus(Corpus):
    r"""
    Provides HDF5 persistence for :class:`.Corpus`\.

    The :class:`.HDF5Corpus` uses a variety of tables and arrays to
    store data. The structure of a typical HDF5 repository for an instance
    of this class is:

    * ``/``

      * ``arrays/``

        * ``authors``: VLArray (String), :class:`.vlarray_dict`
                Maps author indices in ``authors_index`` onto the IDs of papers that they
                authored. Padded with an empty 0th entry.

        * ``authors_index``: EArray (String), see :class:`.vlarray_dict`
                Maps author indices used in ``authors`` to string representations of
                author names (LAST F). Padded with an empty 0th entry.

        * ``papers_citing``: VLArray (String), :class:`.vlarray_dict`
                Each row corresponds to a paper, and contains a set of IDs for the papers
                that cite that paper. Row indices correspond to the entries in
                ``papers_citing_index``. Padded with an empty 0th entry.

        * ``papers_citing_index``: EArray (String), see :class:`.vlarray_dict`
                Maps paper indices used in ``papers_citing`` to string paper IDs.
                Padded with an empty 0th entry.

      * ``axes/``

            Each slice axis is represented by a VLArray (``[slice axis]``) and an EArray
            (``[slice_axis]_keys``).

        * ``[slice axis]`` (e.g. ``date``): VLArray (String)
                Each row is a slice, containing a variable-length array of paper IDs.

        * ``[slice axis]_keys`` (e.g. ``date_keys``): EArray (Int32 or String)
                Maps row indices in ``[slice axis]`` onto slice names/keys.

      * ``citations``/

        * ``papers_table``: Table, see :class:`.papers_table`

                Contains metadata about cited references. These are usually not the same
                papers as those described in ``papers/``.

      * ``features``/
                This group contains data for featuresets. Each featureset has its own
                subgroup, as described below.

        * ``[featureset name]/``

          * ``counts``: Array
                Overall frequency for features across the whole Corpus.

          * ``documentCounts``: Array
                Number of papers in which each feature occurs.

          * ``index``: Array
                Maps indices in ``counts`` and ``documentCounts`` onto string
                representations of each feature.

          * ``features/``
                Contains sparse frequency vectors over features for documents. Each row
                in the arrays below corresponds to a single document. The values of
                ``indices`` are feature indices for each document, and the values of
                ``values`` are the frequencies themselves. ``indices_keys`` and
                ``values_keys`` should be identical, and map the rows in ``indices`` and
                ``values`` onto paper IDs.

                Thus a sparse frequency vector over features for a document can be
                reconstructed as ``freq[d,:] = [ (I[d,0],V[d,0]) ... (I[d,N],V[d,N])]``,
                where ``I`` is the variable-length array ``indices`` and ``V`` is the
                variable-length array ``values``, and ``N`` is the length of the slice
                ``I[d,:]``.

            * ``indices``: VLArray
            * ``indices_keys``: EArray
            * ``values``: VLArray
            * ``values_keys``: EArray

          * ``papers/``
                Contains sparse frequency vectors over documents for features. Same
                structure as in ``features/``, above, except that rows correspond to
                features and ``indices`` contain variable-length arrays of paper IDs.


      * ``papers/``

        * ``papers_table``: Table, see :class:`.papers_table`
                Contains metadata about the papers in this Corpus.

    Since some data types (e.g. list, tuple) are not supported in PyTables/HDF5,
    we make use of cPickle serialization. For example, sparse feature vectors
    (lists of tuples) are pickled for storage in a StringCol.
    """

    def __init__(self, papers, features=None, index_by='wosid',
                       index_citation_by='ayjid', exclude=None,
                       filt=None, datapath=None, index=True):
        r"""
        Open (or create) the HDF5 repository and, optionally, index `papers`.

        Parameters
        ----------
        papers : list
            A list of :class:`.Paper`
        features : dict
            Contains dictionary `{ type: { i: [ (f, w) ] } }` where `i` is an
            index for papers (see kwarg `index_by`), `f` is a feature (e.g. an
            N-gram), and `w` is a weight on that feature (e.g. a count).
        index_by : str
            A key in :class:`.Paper` for indexing. If `features` is provided,
            then this must be the field from which indices `i` are drawn. For
            example, if a dictionary in `features` describes DfR wordcounts for
            the :class:`.Paper`\s in `papers`, and is indexed by DOI, then
            `index_by` should be 'doi'.
        index_citation_by : str
            Just as ``index_by``, except for citations.
        exclude : set
            (optional) Features to ignore, e.g. stopwords. Defaults to a new,
            empty set for each call.
        filt : object
            (optional) Passed through unchanged to :func:`.index`\; see that
            method for its meaning.
        datapath : str
            (optional) Target path for HDF5 repository. If the part after the
            last ``.`` is ``h5``, the value is used as the path of the HDF5
            file itself; otherwise it is treated as a directory in which a new
            ``Corpus-<uuid>.h5`` file is created. If not provided, will
            generate a temporary directory in ``/tmp`` (or equivalent). The
            full path to the HDF5 repo can be found in the ``path`` attribute
            after initialization.
        index : bool
            (default: True) If True, runs :func:`.index`\.
        """

        # A mutable default would be one shared object for every call, so the
        # empty set is created here instead.
        if exclude is None:
            exclude = set()

        logger.debug('Initialize HDF5Corpus with {0} papers'
                                                           .format(len(papers)))

        # Where to save the HDF5 data file?
        if datapath is None:
            self.datapath = tempfile.mkdtemp()
            logger.debug('Generated datapath {0}.'.format(self.datapath))
        else:
            self.datapath = datapath

        # Load or create HDF5 repository.
        if self.datapath.split('.')[-1] == 'h5':
            # datapath already names an HDF5 file; note that ``self.uuid`` is
            # not set on this branch.
            self.path = self.datapath
            title = ''
        else:   # New h5 file.
            self.uuid = uuid.uuid4()    # Unique identifier for this Corpus.
            logger.debug('Datapath has UUID {0}.'.format(self.uuid))
            self.path = '{0}/Corpus-{1}.h5'.format( self.datapath,
                                                            self.uuid   )
            title = 'Corpus-{0}'.format(self.uuid)

        # mode = 'a' will create a new file if no file exists.
        self.h5file = tables.openFile(self.path, mode = 'a', title=title)

        try:
            # Load or create arrays group.
            if '/arrays' not in self.h5file:
                self.group = self.h5file.createGroup("/", 'arrays')
            else:
                self.group = self.h5file.getNode('/arrays')

            logger.debug('Initialize features...')
            self.features = HDF5Features(self.h5file)
            logger.debug('Initialize authors...')
            self.authors = vlarray_dict(self.h5file, self.group,
                                        'authors', tables.StringAtom(100),
                                                   tables.StringAtom(100))

            # { str(f) : feature }
            logger.debug('Initialize citations...')
            self.citations = papers_table(self.h5file, index_citation_by,
                                                       'citations')

            logger.debug('Initialize papers...')
            self.papers = papers_table(self.h5file, index_by, 'papers',
                                            citations=self.citations,
                                            index_citation_by=index_citation_by)

            # { str(f) : [ str(p) ] }
            logger.debug('Initialize papers_citing...')
            self.papers_citing = vlarray_dict(self.h5file, self.group,
                                            'papers_citing',
                                            tables.StringAtom(100),
                                            tables.StringAtom(100))

            self.axes = HDF5Axes(self.h5file)
            self.index_by = index_by    # Field in Paper, e.g. 'wosid', 'doi'.
            self.index_citation_by = index_citation_by

            if index:
                logger.debug('Index Corpus...')
                self.index(papers, features, index_by, index_citation_by,
                                                       exclude, filt)

            logger.debug('HDF5Corpus initialized, flushing to force save.')
            self.h5file.flush()
        except Exception:
            # Construction failed, so no caller will ever receive this object
            # and nobody could close the file later: release the handle now,
            # then let the original error propagate.
            self.h5file.close()
            raise

    def abstract_to_features(self, remove_stopwords=True):
        r"""
        See :func:`.Corpus.abstract_to_features`\.

        Delegates to the parent implementation, then flushes the HDF5 file.

        Parameters
        ----------
        remove_stopwords : bool
            (default: True) If True, passes tokenizer the NLTK stoplist.
        """

        super(HDF5Corpus, self).abstract_to_features(remove_stopwords)
        self.h5file.flush()

    def filter_features(self, fold, fnew, filt):
        r"""
        See :func:`.Corpus.filter_features`\.

        Flushes the HDF5 file, delegates to the parent implementation, then
        flushes again.

        Parameters
        ----------
        fold : str
            Key into ``features`` for existing featureset.
        fnew : str
            Key into ``features`` for resulting featureset.
        filt : method
            Filter function to apply to the featureset. Should take a feature
            dict as its sole parameter.
        """

        self.h5file.flush()
        super(HDF5Corpus, self).filter_features(fold, fnew, filt)
        self.h5file.flush()

def to_hdf5(obj, datapath=None):
    r"""
    Build a :class:`.HDF5Corpus` holding the contents of a :class:`.Corpus`\.

    Use this function to store your :class:`.Corpus`\, e.g. to archive data
    associated with your study or project.

    Parameters
    ----------
    obj : :class:`.Corpus`
        The corpus to copy. Its ``index_by`` and ``index_citation_by``
        attributes are used to configure the new :class:`.HDF5Corpus`\.
    datapath : str
        If provided, will create the new :class:`.HDF5Corpus` at
        that location.

    Returns
    -------
    HD : :class:`.HDF5Corpus`

    Examples
    --------

    .. code-block:: python

       >>> HC = to_hdf5(C, datapath='/path/to/my/archive')

    """

    # Start from an empty repository and skip indexing: the contents are
    # copied over from ``obj`` below rather than built from a list of papers.
    empty_repo = HDF5Corpus(
        [],
        index_by=obj.index_by,
        index_citation_by=obj.index_citation_by,
        datapath=datapath,
        index=False,
    )

    # _migrate_values fills the target and hands it back.
    return _migrate_values(obj, empty_repo)

def from_hdf5(HD_or_path):
    r"""
    Load or transform a :class:`.HDF5Corpus` into a :class:`.Corpus`\.

    If `HD_or_path` is a string, will attempt to load the
    :class:`.HDF5Corpus` from that path.

    Parameters
    ----------
    HD_or_path : str or :class:`.HDF5Corpus`
        If str, must be a path to a :class:`.HDF5Corpus` HDF5 repo.

    Returns
    -------
    D : :class:`.Corpus`

    Raises
    ------
    TypeError
        If `HD_or_path` is neither a string nor a :class:`.HDF5Corpus`\.

    Examples
    --------

    .. code-block:: python

       >>> C = from_hdf5('/path/to/my/archive/MyH5Corpus.h5')
    """

    # Accept both byte and unicode strings on Python 2 (``basestring``), and
    # fall back to ``str`` where ``basestring`` does not exist.
    try:
        string_types = basestring
    except NameError:
        string_types = str

    # isinstance() tests the *value*; the previous ``HD_or_path is str``
    # compared the argument against the ``str`` type object itself and so was
    # never true for an actual path.
    if isinstance(HD_or_path, string_types):
        # Open the repository at that path without re-indexing it.
        hd = HDF5Corpus([], index=False, datapath=HD_or_path)
    elif isinstance(HD_or_path, HDF5Corpus):
        hd = HD_or_path
    else:
        raise TypeError(
            'HD_or_path must be a path (str) or an HDF5Corpus, not {0}'
                                         .format(type(HD_or_path).__name__))

    # Copy everything into a fresh, un-indexed in-memory Corpus.
    D = _migrate_values(hd, Corpus([], index=False))
    return D

def _migrate_values(fromD, toD):
    r"""
    Copy the contents of one :class:`.Corpus` into another.

    `fromD` and `toD` can be anything that behaves like a
    :class:`.Corpus`\, including a :class:`.HDF5Corpus`\.

    The ``papers``, ``citations``, ``papers_citing``, ``authors`` and ``axes``
    mappings are copied item by item; each featureset in ``features`` is
    handed to ``toD._define_features``; finally the ``N_a``, ``N_c`` and
    ``N_p`` counts and the two index-field names are set on `toD`.

    Parameters
    ----------
    fromD : :class:`.Corpus`
        Source :class:`.Corpus`
    toD : :class:`.Corpus`
        Target :class:`.Corpus`

    Returns
    -------
    toD : :class:`.Corpus`
        Updated target :class:`.Corpus` (the same object that was passed in).
    """

    logger.debug('migrate values')

    # Attributes copied entry by entry, in this order, each paired with the
    # debug message that reports the source and target sizes afterwards.
    copied_item_by_item = (
        ('papers', 'papers: {0}->{1}'),
        ('citations', 'citations: {0}->{1}'),
        ('papers_citing', 'papers_citing: {0}->{1}'),
        ('authors', 'authors: {0}->{1}'),
    )
    for attribute, report in copied_item_by_item:
        source = getattr(fromD, attribute)
        target = getattr(toD, attribute)
        for key, value in source.iteritems():
            target[key] = value
        logger.debug(report.format(len(source), len(target)))

    # Featuresets are not assigned directly: each one is unpacked into its
    # components and registered through the target's _define_features.
    for name, featureset in fromD.features.iteritems():
        toD._define_features(name,
                             featureset['index'],
                             featureset['features'],
                             featureset['counts'],
                             featureset['documentCounts'],
                             featureset['papers'])
    logger.debug('features: {0}->{1}'
                                .format(len(fromD.features), len(toD.features)))

    # Slice axes.
    for name, axis in fromD.axes.iteritems():
        toD.axes[name] = axis
    logger.debug('axes: {0}->{1}'.format(len(fromD.axes), len(toD.axes)))

    # Counts are taken from the source's containers.
    toD.N_a = len(fromD.authors)
    toD.N_c = len(fromD.citations)
    toD.N_p = len(fromD.papers)

    # Carry over the fields used to index papers and citations.
    toD.index_by = fromD.index_by
    toD.index_citation_by = fromD.index_citation_by

    return toD