SciPy
Need help? Have a feature request? Please check out the tethne-users group.

Source code for tethne.readers

"""
Methods for parsing bibliographic datasets.

.. autosummary::

   merge
   dfr
   wos
   zotero
   scopus

Each module in :mod:`tethne.readers` provides a ``read`` function that yields
a :class:`.Corpus` instance.

"""

from tethne import Paper, Corpus

class DataError(Exception):
    """
    Exception that carries an arbitrary payload describing a data problem.

    Parameters
    ----------
    value : object
        Stored on the instance as ``value``; its ``repr`` is used as the
        string form of the exception.
    """

    def __init__(self, value):
        # Initialise the base class explicitly so ``args`` is populated
        # through the normal Exception machinery.
        super(DataError, self).__init__(value)
        self.value = value

    def __str__(self):
        """Return ``repr`` of the stored value."""
        return repr(self.value)
# TODO: merge FeatureSets.
def merge(corpus_1, corpus_2, match_by=['ayjid'], match_threshold=1.,
          index_by='ayjid'):
    r"""
    Combines two :class:`.Corpus` instances.

    The default behavior is to match :class:`.Paper`\s using the fields in
    ``match_by``\. If several fields are specified, ``match_threshold`` can be
    used to control how well two :class:`.Paper`\s must match to be combined.

    Alternatively, ``match_by`` can be a callable object that accepts two
    :class:`.Paper` instances, and returns bool. This allows for more complex
    evaluations.

    Where two matched :class:`.Paper`\s have values for the same field, values
    from the :class:`.Paper` instance in ``corpus_1`` will always be preferred.

    Every matching (``corpus_1``, ``corpus_2``) pair produces its own combined
    :class:`.Paper`\; papers that matched nothing are carried over unchanged.
    Feature sets present on either input (and not already created while
    indexing the new :class:`.Corpus`\) are re-keyed by ``index_by`` and
    attached to the result, again preferring ``corpus_1`` on conflict.

    Parameters
    ----------
    corpus_1 : :class:`.Corpus`
        Values from this :class:`.Corpus` will always be preferred in cases of
        conflict.
    corpus_2 : :class:`.Corpus`
    match_by : list or callable
        Either a list of fields used to evaluate whether or not two
        :class:`.Paper`\s should be combined, **OR** a callable that accepts
        two :class:`.Paper` instances and returns bool.
    match_threshold : float
        if ``match_by`` is a list containing more than one field, specifies
        the proportion of fields that must match for two :class:`.Paper`
        instances to be combined. Ignored when ``match_by`` is callable.
    index_by : str
        The field to use as the primary indexing field in the new
        :class:`.Corpus`\. Default is `ayjid`, since this is virtually always
        available.

    Returns
    -------
    combined : :class:`.Corpus`

    Examples
    --------

    .. code-block:: python

       >>> from tethne.readers import wos, dfr, merge
       >>> wos_corpus = wos.read("/Path/to/data1.txt")
       >>> dfr_corpus = dfr.read("/Path/to/DfR")
       >>> corpus = merge(wos_corpus, dfr_corpus)

    """
    # NOTE: the list default for ``match_by`` is shared between calls; that is
    # harmless here because it is only ever read, never mutated.

    # ``unicode`` only exists on Python 2; fall back to ``str`` alone when it
    # is not defined so that normalisation never raises NameError.
    try:
        string_types = (str, unicode)
    except NameError:
        string_types = (str,)

    def norm(value):
        """Strip and lower-case strings; return any other value unchanged."""
        if isinstance(value, string_types):
            return value.strip().lower()
        return value

    def is_match(paper_1, paper_2):
        """Decide whether two papers should be combined."""
        # The user can provide their own matching logic. In this case,
        # match_threshold is ignored.
        if callable(match_by):
            return match_by(paper_1, paper_2)

        # Otherwise we match using the fields in ``match_by``. Not every
        # field needs to match precisely; the proportion of matching fields
        # is compared against ``match_threshold``.
        matches = 0.
        for field in match_by:
            if hasattr(paper_1, field) and hasattr(paper_2, field):
                value_1 = norm(getattr(paper_1, field))
                value_2 = norm(getattr(paper_2, field))
                if value_1 == value_2:
                    matches += 1.
        return matches/len(match_by) >= match_threshold

    def combine(paper_1, paper_2):
        """Build a new Paper from a matched pair, preferring ``paper_1``."""
        paper_new = Paper()
        # Values from paper_2 are written first, so that values from paper_1
        # override them. Empty values never overwrite anything.
        for source in (paper_2, paper_1):
            for key, value in source.__dict__.items():
                if value not in ['', [], None]:
                    paper_new[key] = value
        return paper_new

    def reindex(source_corpus, featureset, target):
        """Copy features into ``target``, keyed by the ``index_by`` field."""
        # NOTE(review): FeatureSet is defined elsewhere; its ``iteritems``
        # method is used exactly as before.
        for index, feature in featureset.iteritems():
            target[getattr(source_corpus[index], index_by)] = feature

    combined = []
    # Sets give constant-time membership tests when filtering unmatched
    # papers below. Assumes index values are hashable -- TODO confirm
    # against Corpus._generate_index.
    exclude_1 = set()
    exclude_2 = set()

    # Attempt to match Papers.
    for paper_1 in corpus_1:
        for paper_2 in corpus_2:
            if not is_match(paper_1, paper_2):
                continue

            # We assemble all papers before creating a new Corpus, so that
            # indexing happens all in one shot.
            combined.append(combine(paper_1, paper_2))

            # Flag matched papers for exclusion.
            exclude_1.add(corpus_1._generate_index(paper_1))
            exclude_2.add(corpus_2._generate_index(paper_2))

    # Include papers that were not matched.
    combined += [paper for paper in corpus_1
                 if corpus_1._generate_index(paper) not in exclude_1]
    combined += [paper for paper in corpus_2
                 if corpus_2._generate_index(paper) not in exclude_2]

    # Here indexing happens all at once, with the new ``index_by`` field.
    corpus = Corpus(combined, index_by=index_by)

    featuresets = {}

    # FeatureSets present in corpus_1 (possibly also in corpus_2).
    for featureset_name, featureset_1 in corpus_1.features.iteritems():
        # We avoid FeatureSets that were generated during the indexing process
        # (e.g. 'citations', 'authors').
        if featureset_name in featuresets or featureset_name in corpus.features:
            continue

        features = {}
        if featureset_name in corpus_2.features:
            reindex(corpus_2, corpus_2.features[featureset_name], features)
        # Written second, so features from corpus_1 are preferred over those
        # from corpus_2.
        reindex(corpus_1, featureset_1, features)

        # Rebuild with the original class (can be FeatureSet or
        # StructuredFeatureSet).
        featuresets[featureset_name] = type(featureset_1)(features)

    # FeatureSets unique to corpus_2.
    for featureset_name, featureset_2 in corpus_2.features.iteritems():
        # Shared names were handled above; names generated during indexing
        # (e.g. 'citations', 'authors') are skipped as well.
        if featureset_name in featuresets or featureset_name in corpus.features:
            continue

        features = {}
        reindex(corpus_2, featureset_2, features)
        featuresets[featureset_name] = type(featureset_2)(features)

    corpus.features.update(featuresets)
    return corpus