Source code for tethne.readers
"""
Methods for parsing bibliographic datasets.

.. autosummary::

   merge
   dfr
   wos
   zotero
   scopus

Each module in :mod:`tethne.readers` provides a ``read`` function that yields
a :class:`.Corpus` instance.
"""
from tethne import Paper, Corpus
class DataError(Exception):
    """
    Exception that carries an arbitrary ``value`` describing the problem.

    Parameters
    ----------
    value : object
        Stored on the instance as ``value``; its ``repr`` is used as the
        string form of the exception.
    """

    def __init__(self, value):
        # Forward the value to the base class so that the standard exception
        # machinery sees the same payload that we keep on ``self.value``.
        super(DataError, self).__init__(value)
        self.value = value

    def __str__(self):
        return repr(self.value)
# TODO: merge FeatureSets.
def merge(corpus_1, corpus_2, match_by=['ayjid'], match_threshold=1.,
          index_by='ayjid'):
    r"""
    Combines two :class:`.Corpus` instances.

    The default behavior is to match :class:`.Paper`\s using the fields in
    ``match_by``\. If several fields are specified, ``match_threshold`` can be
    used to control how well two :class:`.Paper`\s must match to be combined.

    Alternatively, ``match_by`` can be a callable object that accepts two
    :class:`.Paper` instances, and returns bool. This allows for more complex
    evaluations. In that case ``match_threshold`` is ignored.

    Where two matched :class:`.Paper`\s have values for the same field, values
    from the :class:`.Paper` instance in ``corpus_1`` will always be preferred.

    Every pair of papers (one from each corpus) is evaluated, so a paper that
    matches several counterparts yields one combined :class:`.Paper` per
    matching pair.

    Parameters
    ----------
    corpus_1 : :class:`.Corpus`
        Values from this :class:`.Corpus` will always be preferred in cases of
        conflict.
    corpus_2 : :class:`.Corpus`
    match_by : list or callable
        Either a list of fields used to evaluate whether or not two
        :class:`.Paper`\s should be combined, **OR** a callable that accepts
        two :class:`.Paper` instances and returns bool. If an empty list is
        given there is nothing to compare, so no papers are matched and the
        result simply contains the papers of both corpora.
    match_threshold : float
        if ``match_by`` is a list containing more than one field, specifies the
        proportion of fields that must match for two :class:`.Paper` instances
        to be combined.
    index_by : str
        The field to use as the primary indexing field in the new
        :class:`.Corpus`\. Default is `ayjid`, since this is virtually always
        available.

    Returns
    -------
    combined : :class:`.Corpus`

    Examples
    --------

    .. code-block:: python

       >>> from tethne.readers import wos, dfr, merge
       >>> wos_corpus = wos.read("/Path/to/data1.txt")
       >>> dfr_corpus = dfr.read("/Path/to/DfR")
       >>> corpus = merge(wos_corpus, dfr_corpus)

    """

    def norm(value):
        # Strings are compared ignoring case and surrounding whitespace; any
        # other type is compared as-is.
        if isinstance(value, (str, unicode)):
            return value.strip().lower()
        return value

    def reindex(source, featureset, target):
        # Re-key each feature by the ``index_by`` value of the paper it
        # belongs to in ``source``, writing the result into ``target``.
        for index, feature in featureset.iteritems():
            target[getattr(source[index], index_by)] = feature

    combined = []
    exclude_1 = []
    exclude_2 = []

    # These do not change between iterations, so evaluate them only once.
    custom_match = callable(match_by)
    n_fields = 0 if custom_match else len(match_by)

    # Attempt to match Papers.
    for paper_1 in corpus_1:
        for paper_2 in corpus_2:
            # The user can provide their own matching logic. In this case,
            # match_threshold is ignored.
            if custom_match:
                match = match_by(paper_1, paper_2)

            # With no fields to compare, a proportion of matching fields is
            # undefined (it would be a division by zero); treat as no match.
            elif n_fields == 0:
                match = False

            # Otherwise we match using the fields in ``match_by``.
            else:
                matches = 0.
                for field in match_by:
                    if hasattr(paper_1, field) and hasattr(paper_2, field):
                        value_1 = norm(getattr(paper_1, field))
                        value_2 = norm(getattr(paper_2, field))
                        if value_1 == value_2:
                            matches += 1.

                # Not every field needs to match precisely; only the
                # proportion given by ``match_threshold``.
                match = matches / n_fields >= match_threshold

            if match:
                paper_new = Paper()

                # We add values from paper_2 first, so that...
                for key, value in paper_2.__dict__.iteritems():
                    if value not in ['', [], None]:
                        paper_new[key] = value

                # ...values from paper_1 will override values from paper_2.
                for key, value in paper_1.__dict__.iteritems():
                    if value not in ['', [], None]:
                        paper_new[key] = value

                # We assemble all papers before creating a new Corpus, so that
                # indexing happens all in one shot.
                combined.append(paper_new)

                # Flag matched papers for exclusion.
                exclude_1.append(corpus_1._generate_index(paper_1))
                exclude_2.append(corpus_2._generate_index(paper_2))

    # Include papers that were not matched.
    combined += [paper for paper in corpus_1
                 if corpus_1._generate_index(paper) not in exclude_1]
    combined += [paper for paper in corpus_2
                 if corpus_2._generate_index(paper) not in exclude_2]

    # Here indexing happens all at once, with the new ``index_by`` field.
    corpus = Corpus(combined, index_by=index_by)

    featuresets = {}
    for name, featureset_1 in corpus_1.features.iteritems():
        # We avoid FeatureSets that were generated during the indexing process
        # (e.g. 'citations', 'authors').
        if name in featuresets or name in corpus.features:
            continue

        features = {}
        if name in corpus_2.features:
            reindex(corpus_2, corpus_2.features[name], features)

        # Written second, so features from corpus_1 are preferred over those
        # from corpus_2.
        reindex(corpus_1, featureset_1, features)

        # Can be FeatureSet or StructuredFeatureSet; reuse whichever class
        # corpus_1 holds.
        featuresets[name] = type(featureset_1)(features)

    # FeatureSets unique to corpus_2.
    for name, featureset_2 in corpus_2.features.iteritems():
        # We avoid FeatureSets that were generated during the indexing process
        # (e.g. 'citations', 'authors'), and those already merged above.
        if name in featuresets or name in corpus.features:
            continue

        features = {}
        reindex(corpus_2, featureset_2, features)

        # Can be FeatureSet or StructuredFeatureSet.
        featuresets[name] = type(featureset_2)(features)

    corpus.features.update(featuresets)
    return corpus