"""
Helper classes and methods for :mod:`tethne.persistence.hdf5`\.
TODO: move away from index table pattern, toward index array pattern.
"""
import logging
logging.basicConfig(filename=None, format='%(asctime)-6s: %(name)s - %(levelname)s - %(module)s - %(funcName)s - %(lineno)d - %(message)s')
logger = logging.getLogger(__name__)
logger.setLevel('INFO')
import numpy
import tables
import tempfile
import uuid
import cPickle as pickle
import urllib
from unidecode import unidecode
import os
from ...classes import Paper
pytype = { tables.atom.BoolAtom: bool,
tables.atom.UInt8Atom: int,
tables.atom.Int16Atom: int,
tables.atom.UInt16Atom: int,
tables.atom.Int32Atom: int,
tables.atom.UInt32Atom: long,
tables.atom.Int64Atom: long,
tables.atom.UInt64Atom: long,
tables.atom.Float32Atom: float,
tables.atom.Float64Atom: float,
tables.atom.Complex64Atom: complex,
tables.atom.Complex128Atom: complex,
tables.atom.StringAtom: str,
tables.atom.Time32Atom: int,
tables.atom.Time64Atom: float,
numpy.int32: int }
[docs]def get_h5file(typename, datapath=None):
"""
Load or create an HDF5 data file.
"""
# Where to save the HDF5 data file?
if datapath is None:
datapath = tempfile.mkdtemp()
logger.debug('Generated datapath {0}.'.format(datapath))
this_uuid = uuid.uuid4() # Unique identifier for this H5File
# Load or create HDF5 repository.
if datapath.split('.')[-1] == 'h5': # Path to file specified.
path = datapath
else: # Generate a new path.
logger.debug('H5File has UUID {0}.'.format(this_uuid))
path = '{0}/{1}-{2}.h5'.format(datapath, typename, this_uuid)
# mode = 'a' will create a new file if no file exists.
if os.path.exists(path):
h5file = tables.openFile(path, mode='a')
else:
title = '{0}-{1}'.format(typename, this_uuid)
h5file = tables.openFile(path, mode='a', title=title)
return h5file, path, uuid
[docs]def get_or_create_group(h5file, name, where=None):
if where is None:
if '/' + name not in h5file:
group = h5file.createGroup('/', name)
else:
group = h5file.getNode('/' + name)
else:
if name not in where:
group = h5file.createGroup(where, name)
else:
group = h5file.getNode(where, name)
return group
[docs]def get_or_create_table(h5file, group, name, model):
if name not in group:
table = h5file.createTable(group, name, model)
else:
table = h5file.getNode(group, name)
return table
[docs]def get_or_create_array(h5file, group, name, values):
if name not in group:
array = h5file.create_array(group, name, values)
else:
array = h5file.getNode(group, name)
return array
class FieldIndex(tables.IsDescription):
"""
For storing metadata about fields.
"""
name = tables.StringCol(100) # name of a field.
type = tables.StringCol(100) # should correspond to a Python dtype.
class HDF5Paper(tables.IsDescription):
"""
Provides persistence for :class:`.Paper` within a
:class:`.HDF5Corpus`\.
"""
mindex = tables.StringCol(100)
aulast = tables.StringCol(1000)
auinit = tables.StringCol(1000)
atitle = tables.StringCol(200)
jtitle = tables.StringCol(200)
volume = tables.StringCol(6)
issue = tables.StringCol(6)
spage = tables.StringCol(6)
epage = tables.StringCol(6)
ayjid = tables.StringCol(200)
doi = tables.StringCol(100)
pmid = tables.StringCol(100)
wosid = tables.StringCol(100)
abstract = tables.StringCol(5000)
accession = tables.StringCol(100)
date = tables.Int32Col()
citations = tables.StringCol(5000) # List of citation keys.
class Index(tables.IsDescription):
"""
For storing int : str pairs.
"""
i = tables.Int32Col()
mindex = tables.StringCol(1000)
class IntIndex(tables.IsDescription):
"""
For storing int : int pairs.
"""
i = tables.Int32Col()
mindex = tables.Int32Col()
class StrIndex(tables.IsDescription):
"""
For storing str : str pairs.
"""
i = tables.StringCol(100)
mindex = tables.StringCol(100000)
[docs]class HDF5Axes(dict):
"""
Organizes axes.
"""
def __init__(self, h5file):
logger.debug('Initialize HDF5Axes.')
self.h5file = h5file
# Load or create axes group.
if '/axes' not in self.h5file:
self.group = self.h5file.createGroup('/', 'axes')
else:
self.group = self.h5file.getNode('/axes')
def __setitem__(self, key, value):
logger.debug('key {0}'.format(key))
if type(value.values()[0][0]) is str:
keyatom = tables.StringAtom(200)
elif type(value.values()[0][0]) is int:
keyatom = tables.Int32Atom()
logger.debug('keyatom: {0}'.format(keyatom))
valuesatom = tables.StringAtom(200)
dict.__setitem__(self, key,
vlarray_dict(self.h5file, self.group, key, valuesatom, keyatom) )
for k,v in value.iteritems():
self[key][k] = v
def __getitem__(self, key):
try:
return dict.__getitem__(self, key)
except KeyError:
dict.__setitem__(self, key,
vlarray_dict(self.h5file, self.group, key))
return dict.__getitem__(self, key)
def __len__(self):
return len(self.group._f_listNodes())/2
[docs]class HDF5Axis(dict):
"""
Organizes a single axis.
"""
def __init__(self, h5file, fgroup, name):
self.h5file = h5file
self.name = name
# Load or create group.
if name not in fgroup:
self.group = self.h5file.createGroup(fgroup, name)
else:
self.group = self.h5file.getNode(fgroup, name)
def __setitem__(self, key, value):
name = '{0}_{1}'.format(self.name, key)
h5dict = HDF5ArrayDict(self.h5file, self.group, name, value)
dict.__setitem__(self, key, h5dict)
def __len__(self):
return len(self.group._f_listNodes())
[docs]class HDF5Features(dict):
"""
Organizes feature-sets, each as a :class:`.HDF5Feature`\.
"""
def __init__(self, h5file):
logger.debug('Initialize HDF5Features.')
self.h5file = h5file
# Load or create features group.
self.group = get_or_create_group(self.h5file, 'features')
def __setitem__(self, key, value):
logger.debug('HDF5Features.___setitem__ for key {0}.'.format(key))
fset = HDF5FeatureSet(self.h5file, self.group, key)
dict.__setitem__(self, key, fset)
logger.debug('assign values for key {0}.'.format(key))
for k,v in value.iteritems():
self[key][k] = v
def __getitem__(self, key):
try:
return dict.__getitem__(self, key)
except KeyError:
fset = HDF5FeatureSet(self.h5file, self.group, key)
dict.__setitem__(self, key, fset)
return dict.__getitem__(self, key)
[docs]class HDF5FeatureSet(dict):
"""
Stores data about the distribution of a specific feature-set, e.g. unigrams,
across papers in the :class:`.Corpus`\.
"""
def __init__(self, h5file, fgroup, name):
logger.debug('Initializing HDF5Feature with name {0}.'.format(name))
self.h5file = h5file
# Load or create a group.
self.group = get_or_create_group(self.h5file, name, fgroup)
self.name = name
dict.__setitem__(self, 'features', HDF5FeatureValues(h5file, self.group, 'features'))
logger.debug('...done.')
def __setitem__(self, key, value):
logger.debug('HDF5Feature ({0}): __setitem__ for key {1}, length {2}.'
.format(self.name, key, len(value)))
if key not in self:
logger.debug('{0} does not exist, creating...'.format(key))
if key == 'features':
dict.__setitem__(self, key,
HDF5FeatureValues( self.h5file, self.group, key ) )
elif key == 'papers':
dict.__setitem__(self, key,
HDF5FeatureValues( self.h5file, self.group, key, keyatom=tables.Int32Atom(),
indexatom=tables.StringAtom(200)) )
for k,v in value.iteritems():
self[key][k] = v
else:
values = numpy.array([ value[k] for k in sorted(value.keys()) ])
dict.__setitem__(self, key,
HDF5ArrayDict( self.h5file, self.group, key, values ) )
else:
logger.debug('setting values for {0}...'.format(key))
for k,v in value.iteritems():
self[key][k] = v
def __getitem__(self, key):
try:
return dict.__getitem__(self, key)
except KeyError:
if key == 'features':
dict.__setitem__(self, key,
HDF5FeatureValues( self.h5file, self.group, key ) )
elif key == 'papers':
dict.__setitem__(self, key,
HDF5FeatureValues( self.h5file, self.group, key, keyatom=tables.Int32Atom(),
indexatom=tables.StringAtom(200)) )
else:
dict.__setitem__(self, key,
HDF5ArrayDict( self.h5file, self.group, key, [] ) )
return dict.__getitem__(self, key)
[docs]class HDF5ArrayDict(dict):
def __init__(self, h5file, group, name, values):
self.h5file = h5file
self.group = group
self.name = name
# Load or create a new array.
self.array = get_or_create_array(self.h5file, self.group, name, values)
# Prime values.
for i in xrange(len(self.array)):
self.__getitem__(i)
def __setitem__(self, key, value):
self.array[key] = value
dict.__setitem__(self, key, value)
def __getitem__(self, key):
try:
return dict.__getitem__(self, key)
except KeyError:
value = self.array[key]
dict.__setitem__(self, key, value)
return value
[docs] def items(self):
return { i:self.array[i] for i in xrange(len(self.array)) }.items()
[docs] def iteritems(self):
i = 0
while i < len(self.array):
yield i, self.array[i]
i += 1
def __len__(self):
return len(self.array)
[docs] def values(self):
return [ self.array[i] for i in xrange(len(self.array)) ]
[docs] def keys(self):
return range(len(self.array))
[docs]class HDF5SparseValues(dict):
"""
Parameters
----------
h5file
group
name
iatom
katom
indexatom
"""
def __init__(self, h5file, group, name, iatom, katom, indexatom):
logger.debug('initialize HDF5SparseValues for {0}'.format(name))
self.h5file = h5file
self.group = get_or_create_group(self.h5file, name, group)
# For sparse arrays like index:[ (i1,k1), (i2,k2) ... (iN,kN) ], self.I
# holds a vlarray index:[ i1,i2,...iN ], and self.K holds a vlarray
# index:[ k1,k2,...kN ].
self.I = vlarray_dict( h5file, self.group,
'{0}_I'.format(name),
iatom, indexatom )
self.K = vlarray_dict( h5file, self.group,
'{0}_K'.format(name),
katom, indexatom )
def __setitem__(self, key, value):
# Split sparse vector [ (i1,k1), (i2,k2) ... (iN,kN) ] into constituents
# [ i1,i2,...iN ] and [ k1,k2,...kN ].
try:
I_values, K_values = zip(*value)
except: # OK to store empty vectors.
I_values = []
K_values = []
# Store the constituent vectors separately.
self.I[key] = I_values
self.K[key] = K_values
def __getitem__(self, key):
# Load constituent vectors [ i1,i2,...iN ] and [ k1,k2,...kN ], and
# reconstruct sparse vector [ (i1,k1), (i2,k2) ... (iN,kN) ]
I_values = self.I[key]
K_values = self.K[key]
return zip(I_values, K_values)
def __len__(self):
return len(self.I)
[docs] def iteritems(self):
i = 0
keys = self.I.keys()
while i < len(self.I):
yield keys[i], self.__getitem__(keys[i])
i += 1
[docs]class HDF5FeatureValues(dict):
def __init__(self, h5file, group, name, keyatom=tables.StringAtom(200),
indexatom=tables.Int32Atom(),
valueatom=tables.Float64Atom()):
self.h5file = h5file
self.group = get_or_create_group(self.h5file, name, group)
self.indices = vlarray_dict( self.h5file, self.group,
'indices', indexatom, keyatom )
self.values = vlarray_dict( self.h5file, self.group,
'values', valueatom, keyatom )
def __setitem__(self, key, value):
i,v = zip(*value)
self.indices[key] = list(i)
self.values[key] = list(v)
def __getitem__(self, key):
i = self.indices[key]
v = self.values[key]
value = zip(i,v)
return value
def __len__(self):
return len(self.indices)
[docs] def iteritems(self):
N = self.__len__()
keys = self.indices.keys()
i = 0
while i < N:
yield keys[i], self.__getitem__(keys[i])
i += 1
[docs]class papers_table(dict):
"""
Mimics the `papers` dict in :class:`.Paper`\, providing HDF5 persistence.
Values should be set only once for a key.
Parameters
----------
h5file : tables.file.File
A :class:`tables.file.File` object.
index_by : str
Key in :class:`.Paper` used to index papers in this
:class:`.Corpus`\.
"""
def __init__(self, h5file, index_by, name, citations=None,
index_citation_by='ayjid'):
self.h5file = h5file
self.index_by = index_by
# Load or create group.
if '/{0}'.format(name) not in self.h5file:
self.group = self.h5file.createGroup("/", name)
else:
self.group = self.h5file.getNode('/{0}'.format(name))
# Load or create table.
if 'papers_table' not in self.group:
self.table = self.h5file.createTable(self.group, 'papers_table',
HDF5Paper)
self.indexrows = self.table.cols.mindex.createIndex()
else:
self.table = self.h5file.getNode(self.group, 'papers_table')
self.citations = citations
self.index_citation_by = index_citation_by
def _recast_value(self, to, value):
"""
Recasts values from HDF5 table as Python types.
"""
if to is list and type(value) is not list: # avoid re-unpickling.
if value is None or len(value) == 0:
v = []
else:
v = pickle.loads(str(value))
else:
v = to(value)
return v
def _to_paper(self, hdf5paper):
"""
Yields a :class:`.Paper` from an :class:`.HDF5Paper`\.
"""
paper = Paper()
keys = self.table.description._v_dtypes.keys()
for kname in keys:
if kname == 'mindex': continue # Not a valid field for Paper.
if kname in paper.list_fields: to = list
elif kname in paper.string_fields: to = str
elif kname in paper.int_fields: to = int
v = self._recast_value(to, hdf5paper[kname])
if kname == 'citations': # 'citations' should contain Papers.
if self.citations is None: continue # May not have any.
try:
v = [ self._to_paper(self.citations[a]) for a in v ]
except IndexError:
v = []
paper[kname] = v
return paper
def __setitem__(self, key, value):
hpaper = self.table.row
hpaper['mindex'] = key
for k, v in value.iteritems():
if k in self.table.cols.__dict__:
if v is not None:
# Lists will be pickled for storage.
if type(v) is list:
# Citations will be stored as list of citation indices.
if k == 'citations':
if self.citations is None: continue
try:
v = [ a[self.index_citation_by] for a in v ]
except IndexError:
v = []
v = pickle.dumps(v)
hpaper[k] = v
hpaper.append()
self.table.flush() # We need table completely up-to-date for subsequent
# operations.
def __getitem__(self, key):
return [ self._to_paper(x) for x
in self.table.where('mindex == b"{0}"'.format(key)) ][0]
def __len__(self):
return len([ r for r in self.table])
def __contains__(self, key):
size = len([ self._to_paper(x) for x
in self.table.where('mindex == b"{0}"'.format(key)) ])
return size > 0
[docs] def iteritems(self):
i = 0
while i < len(self.table):
x = self.table[i]
yield x['mindex'],self._to_paper(x)
i += 1
[docs] def values(self):
return [ self._to_paper(x) for x in self.table ]
[docs] def keys(self):
return [ x['mindex'] for x in self.table ]
[docs]class vlarray_dict(dict):
"""
Provides dict-like access to an HDF5 VLArray.
"""
def __init__(self, h5file, group, name, atom, keyatom):
logger.debug('initialize vlarray_dict with name {0}'.format(name))
self.h5file = h5file
self.group = group
self.name = name
self.atom = atom
self.keyatom = keyatom
try:
self.pytype = pytype[type(atom)]
except KeyError:
raise NotImplementedError('No equivalent Python type for atom.')
# Load or create vlarray.
if self.name not in self.group:
logger.debug('no node for VLArray {0} in group {1}, creating...'
.format(self.name, self.group))
self.vlarray = self.h5file.createVLArray( self.group,
self.name,
self.atom )
# Pad with dummy data.
if self.atom.shape == ():
self.vlarray.append([self.atom.dflt])
else:
size = numpy.zeros(shape=self.atom.shape).size
padding = numpy.array([ self.atom.dflt for d in xrange(size)]) \
.reshape(self.atom.shape)
self.vlarray.append(padding)
else:
logger.debug('found node for VLArray {0} in group {1}, loading...'
.format(self.name, self.group))
self.vlarray = self.h5file.getNode(self.group, self.name)
self.atom = self.vlarray.atom
indexname = '{0}_keys'.format(self.name)
# If create, pads with dummy index at 0.
if indexname not in self.group:
self.I = self.h5file.createEArray( self.group,
indexname,
keyatom, (0,) )
if keyatom.dflt == 0: dflt = -1
if keyatom.dflt == 0.0: dflt = -1.
else: dflt = keyatom.dflt
self.I.append([dflt])
else:
self.I = self.h5file.getNode(self.group, indexname)
self.keyatom = self.I.atom
def __setitem__(self, key, value):
if key not in self:
self.I.append([key])
self.vlarray.append([ v for v in value ])
dict.__setitem__(self, key, value)
def __getitem__(self, key):
try:
i = list(self.I.read())[1:].index(key)
data = self.vlarray.read()[1:]
except ValueError:
raise KeyError()
try:
return dict.__getitem__(self, key)
except KeyError:
if self.atom.shape != ():
value = numpy.array([ [ self.pytype(v) for v in d ]
for d in data[i] ])
else:
value = numpy.array([ self.pytype(v) for v in data[i] ])
dict.__setitem__(self, key, value)
return value
def __contains__(self,key):
return key in self.keys()
[docs] def values(self):
values = [ self[i] for i in self.I.read()[1:] ]
return values
[docs] def keys(self):
return self.I.read()[1:]
def __len__(self):
return len(self.vlarray) -1 # Compensate for padding.
def __str__(self):
return str(self.items())
[docs] def items(self):
return zip(self.keys(), self.values())
[docs] def iteritems(self):
return iter(zip(self.keys(), self.values()))