SciPy

Source code for tethne.persistence.hdf5.ldamodel

import logging
# Module-level logging setup; all three statements run once, at import time.
# NOTE(review): basicConfig() acts on the root logger, and the level below is
# hard-coded to DEBUG, so simply importing this module affects the host
# application's logging output -- confirm this is intended for library code.
logging.basicConfig()
# Logger named after this module; used by every debug message below.
logger = logging.getLogger(__name__)
logger.setLevel('DEBUG')

from util import *
from ...model import LDAModel

class HDF5LDAModel(LDAModel):
    """
    HDF5-backed variant of :class:`.LDAModel`.

    On construction the model opens (or creates) an HDF5 repository via
    ``get_h5file`` and binds ``theta``, ``phi``, ``metadata`` and
    ``vocabulary`` to objects stored in that repository.

    Parameters
    ----------
    theta : array-like (optional)
        Passed to ``get_or_create_array`` for the ``'theta'`` array. The size
        of its first dimension is exposed as ``M``.
    phi : array-like (optional)
        Passed to ``get_or_create_array`` for the ``'phi'`` array. The sizes
        of its first and second dimensions are exposed as ``Z`` and ``W``.
    metadata : (optional)
        Passed to ``HDF5Metadata``. Records are expected to carry an ``'id'``
        entry, which is used to build ``lookup``.
    vocabulary : dict-like (optional)
        Only read when the repository has no stored ``'vocabulary'`` yet; its
        values are stored ordered by sorted key.
    datapath : str (optional)
        Forwarded to ``get_h5file``. The resulting location is available as
        the ``path`` attribute.
    """

    def __init__(self, theta=None, phi=None, metadata=None, vocabulary=None,
                 datapath=None):
        logger.debug('HDF5LDAModel: initialize.')

        self.h5file, self.path, self.uuid = get_h5file('LDAModel', datapath)
        store = self.h5file
        logger.debug('HDF5LDAModel: got h5file at path {0}'.format(self.path))

        # Group holding the numeric arrays (loaded if present, else created).
        self.agroup = get_or_create_group(store, 'arrays')
        logger.debug('HDF5LDAModel: initialized array group.')

        self.theta = get_or_create_array(store, self.agroup, 'theta', theta)
        theta_shape = self.theta.shape
        self.M = theta_shape[0]
        logger.debug(
            'HDF5LDAModel: initialized theta with shape {0}'
            .format(theta_shape))

        self.phi = get_or_create_array(store, self.agroup, 'phi', phi)
        phi_shape = self.phi.shape
        self.Z = phi_shape[0]
        self.W = phi_shape[1]
        logger.debug(
            'HDF5LDAModel: initialized phi with shape {0}'.format(phi_shape))

        self.metadata = HDF5Metadata(store, metadata)
        record_count = len(self.metadata)
        logger.debug(
            'HDF5LDAModel: initialized metadata with {0} records'
            .format(record_count))

        self.vgroup = get_or_create_group(store, 'vocabulary')
        logger.debug('HDF5LDAModel: initialized vocabulary group')

        # Seed values are only needed when nothing is stored yet; otherwise an
        # empty list is handed over.
        # NOTE(review): with vocabulary=None and no stored vocabulary, the
        # ``.keys()`` call below raises AttributeError.
        ordered_terms = []
        if 'vocabulary' not in self.vgroup:
            ordered_terms = [
                vocabulary[key] for key in sorted(vocabulary.keys())
            ]

        self.vocabulary = HDF5ArrayDict(
            store, self.vgroup, 'vocabulary', ordered_terms)
        store.flush()
        logger.debug(
            'HDF5LDAModel: initialized vocabulary with {0} entries'
            .format(len(self.vocabulary)))

        # In-memory only (never written to the HDF5 file): maps each metadata
        # record's 'id' back to its key in ``self.metadata``.
        self.lookup = dict(
            (record['id'], key)
            for key, record in self.metadata.iteritems()
        )

        logger.debug('HDF5LDAModel: initialization complete')
def to_hdf5(model, datapath=None):
    """
    Build a :class:`.HDF5LDAModel` from an existing :class:`.LDAModel`.

    ``model.theta`` and ``model.phi`` are converted with ``numpy.array``;
    ``model.metadata`` and ``model.vocabulary`` are handed over unchanged.

    Parameters
    ----------
    model : :class:`.LDAModel`
        The model to persist.
    datapath : str (optional)
        Path to an HDF5 repository. If not provided, generates a temporary
        path, which can be accessed as the ``.path`` attribute.

    Returns
    -------
    hdf5_model : :class:`.HDF5LDAModel`
    """
    theta_array = numpy.array(model.theta)
    phi_array = numpy.array(model.phi)

    return HDF5LDAModel(
        theta=theta_array,
        phi=phi_array,
        metadata=model.metadata,
        vocabulary=model.vocabulary,
        datapath=datapath,
    )
def from_hdf5(HD_or_path):
    """
    Load a :class:`.LDAModel` from a :class:`.HDF5LDAModel` or from a path.

    The returned model is constructed directly from the ``theta``, ``phi``,
    ``metadata`` and ``vocabulary`` attributes of the HDF5 model; they are
    passed through as-is rather than copied.

    Parameters
    ----------
    HD_or_path : str or :class:`.HDF5LDAModel`
        If a string, must be a path to a :class:`.HDF5LDAModel` HDF5 repo.
        Subclasses of ``str`` (and ``unicode`` on Python 2) and subclasses of
        :class:`.HDF5LDAModel` are accepted as well.

    Returns
    -------
    model : :class:`.LDAModel`

    Raises
    ------
    AttributeError
        If ``HD_or_path`` is neither a string nor a :class:`.HDF5LDAModel`.

    Examples
    --------
    From a path:

    .. code-block:: python

       >>> model = from_hdf5('/path/to/my/HDF5LDAModel.h5')

    """
    # ``basestring`` exists only on Python 2, where it covers both ``str`` and
    # ``unicode``; fall back to ``str`` elsewhere.
    try:
        string_types = basestring
    except NameError:
        string_types = str

    # isinstance (rather than an exact ``type(...) is`` comparison) so that
    # subclasses are not rejected with a misleading error.
    if isinstance(HD_or_path, string_types):
        hmodel = HDF5LDAModel(datapath=HD_or_path)
    elif isinstance(HD_or_path, HDF5LDAModel):
        hmodel = HD_or_path
    else:
        # Exception type and message kept for backward compatibility.
        raise AttributeError('Must provide datapath or HDF5LDAModel object.')

    return LDAModel(
        theta=hmodel.theta,
        phi=hmodel.phi,
        metadata=hmodel.metadata,
        vocabulary=hmodel.vocabulary,
    )