SciPy

Source code for tethne.persistence.hdf5.dtmmodel

# Module-level logging setup. These statements run at import time.
import logging
# basicConfig() attaches a default handler to the *root* logger if it has none,
# so importing this module affects logging configuration process-wide.
logging.basicConfig()
logger = logging.getLogger(__name__)
# NOTE(review): the level is forced to DEBUG for this module's logger on every
# import; callers wanting quieter output must reset it after importing.
logger.setLevel('DEBUG')

from util import *
from ...model import DTMModel

import numpy

class HDF5DTMModel(DTMModel):
    """
    HDF5-backed persistence layer for :class:`.DTMModel`.

    When ``datapath`` refers to an existing HDF5 file, none of the other
    parameters have to be supplied.

    Parameters
    ----------
    e_theta : matrix-like
        Distribution of topics (Z) in documents (M). Shape: (Z, M).
    phi : matrix-like
        Topic (Z) distribution over words (W), over time (T).
        Shape: (Z, W, T).
    metadata : dict
        Maps matrix indices onto document metadata.
    vocabulary : dict
        Maps W indices onto words.
    datapath : str (optional)
        Path to an HDF5 repository. If not provided, a temporary path is
        generated; it is available afterwards as the ``.path`` attribute.
    """

    def __init__(self, e_theta=None, phi=None, metadata=None,
                 vocabulary=None, datapath=None):
        """
        Open (or create) the HDF5 repository and bind its contents to the
        instance.
        """
        logger.debug('HDF5DTMModel: initialize.')

        h5file, path, uuid = get_h5file('DTMModel', datapath)
        self.h5file = h5file
        self.path = path
        self.uuid = uuid
        logger.debug('HDF5DTMModel: got h5file at path {0}'.format(self.path))

        # Load or create the group that holds the numeric arrays.
        self.agroup = get_or_create_group(self.h5file, 'arrays')
        logger.debug('HDF5DTMModel: initialized array group.')

        # Document-topic matrix.
        self.e_theta = get_or_create_array(
            self.h5file, self.agroup, 'e_theta', e_theta
        )
        theta_shape = self.e_theta.shape
        self.Z = theta_shape[0]     # Number of topics.
        self.M = theta_shape[1]     # Number of documents.
        logger.debug(
            'HDF5DTMModel: initialized theta with shape {0}'
            .format(self.e_theta.shape)
        )

        # Topic-word-time tensor.
        self.phi = get_or_create_array(
            self.h5file, self.agroup, 'phi', phi
        )
        phi_shape = self.phi.shape
        self.W = phi_shape[1]       # Number of words.
        self.T = phi_shape[2]       # Number of time periods.
        logger.debug(
            'HDF5DTMModel: initialized phi with shape {0}'
            .format(self.phi.shape)
        )

        self.metadata = HDF5Metadata(self.h5file, metadata)
        logger.debug(
            'HDF5DTMModel: initialized metadata with {0} records'
            .format(len(self.metadata))
        )

        self.vgroup = get_or_create_group(self.h5file, 'vocabulary')
        logger.debug('HDF5DTMModel: initialized vocabulary group')

        # Only consult the ``vocabulary`` argument when the group has no
        # 'vocabulary' node yet; otherwise an empty list is handed over.
        if 'vocabulary' in self.vgroup:
            vocab_sorted = []
        else:
            ordered_keys = sorted(vocabulary.keys())
            vocab_sorted = [vocabulary[key] for key in ordered_keys]

        self.vocabulary = HDF5ArrayDict(
            self.h5file, self.vgroup, 'vocabulary', vocab_sorted
        )

        self.h5file.flush()
        logger.debug(
            'HDF5DTMModel: initialized vocabulary with {0} entries'
            .format(len(self.vocabulary))
        )

        # Reverse index from each record's 'id' to its metadata key.
        # Doesn't get stored.
        self.lookup = dict(
            (record['id'], index)
            for index, record in self.metadata.iteritems()
        )

        logger.debug('HDF5DTMModel: initialization complete')
def to_hdf5(model, datapath=None):
    """
    Build an :class:`.HDF5DTMModel` holding the contents of ``model``.

    Parameters
    ----------
    model : :class:`.DTMModel`
        Source model; its ``e_theta``, ``phi``, ``metadata`` and
        ``vocabulary`` attributes are read.
    datapath : str (optional)
        Path to an HDF5 repository. If not provided, a temporary path is
        generated; it is available afterwards as the ``.path`` attribute.

    Returns
    -------
    hdf5_model : :class:`.HDF5DTMModel`
    """
    # Copy both mappings into plain dicts before handing them over.
    plain_metadata = dict(model.metadata.iteritems())
    plain_vocabulary = dict(model.vocabulary.iteritems())

    return HDF5DTMModel(
        e_theta=numpy.array(model.e_theta),
        phi=numpy.array(model.phi),
        metadata=plain_metadata,
        vocabulary=plain_vocabulary,
        datapath=datapath
    )
def from_hdf5(HD_or_path):
    """
    Load a :class:`.DTMModel` from a :class:`.HDF5DTMModel`.

    The returned model holds copies of the stored data: the arrays are
    converted with ``numpy.array`` and the metadata and vocabulary are
    copied into plain dicts.

    Parameters
    ----------
    HD_or_path : str or :class:`.HDF5DTMModel`
        If str, must be a path to a :class:`.HDF5DTMModel` HDF5 repo.
        Subclasses of either type are accepted as well.

    Returns
    -------
    model : :class:`.DTMModel`

    Raises
    ------
    AttributeError
        If ``HD_or_path`` is neither a str nor a :class:`.HDF5DTMModel`.

    Examples
    --------
    From a path:

    .. code-block:: python

       >>> model = from_hdf5('/path/to/my/HDF5DTMModel.h5')

    """
    # isinstance (rather than an exact ``type(...) is`` comparison) so that
    # subclasses of str and of HDF5DTMModel are not rejected.
    if isinstance(HD_or_path, str):
        hmodel = HDF5DTMModel(datapath=HD_or_path)
    elif isinstance(HD_or_path, HDF5DTMModel):
        hmodel = HD_or_path
    else:
        # AttributeError is kept (not TypeError) for backward compatibility
        # with callers that already catch it.
        raise AttributeError('Must provide datapath or HDF5DTMModel object.')

    metadata = {k: v for k, v in hmodel.metadata.iteritems()}
    vocabulary = {k: v for k, v in hmodel.vocabulary.iteritems()}

    model = DTMModel(
        e_theta=numpy.array(hmodel.e_theta),
        phi=numpy.array(hmodel.phi),
        metadata=metadata,
        vocabulary=vocabulary
    )

    return model