SciPy

Source code for tethne.serialize.paper

"""
This module provides functionality to serialize a TETHNE corpus object into
JSON fixtures so that it can be persisted in the database.

.. code-block:: python

   >>> from tethne.serialize import paper

"""

import os
import json
import re

from tethne.dao import tethnedao
from time import gmtime, strftime
from tethne.utilities import _strip_punctuation


class SerializeUtility:
    """Stateless helpers for parsing author-address literals."""

    @staticmethod
    def get_auth_inst(address):
        """
        Split an address literal into its institute part and author names.

        An address of the form ``"[Author A; Author B] Institute, ..."`` is
        split at the last ``]``. An address without ``]`` carries no author
        information and is returned unchanged.

        Parameters
        ----------
        address - The address literal to split.

        Returns
        -------
        A tuple ``(institute_literal, authors)``. ``authors`` is ``None``
        when the address contains no ``]``; otherwise it is a list of the
        ``;``-separated author names with surrounding whitespace removed and
        blank entries dropped.
        """
        if ']' not in address:
            return address, None

        # Split once at the LAST ']' so brackets inside the author list
        # cannot cut the institute part short.
        author_literal, institute_literal = address.rsplit(']', 1)
        names = [name.strip()
                 for name in author_literal.replace('[', '').split(';')]
        # Blank names (e.g. "[] Inst" or a trailing ';') identify nobody and
        # would only break the later author lookup.
        authors = [name for name in names if name]
        return institute_literal.lstrip(), authors
class Serialize:
    """
    This class is used to serialize the Corpus object and has methods to
    create fixtures (JSON-serializable lists of dicts) for the different
    models in the TETHNE database.

    The ``serialize*`` methods communicate through the ``*IdMap``
    dictionaries, so they have to be called in dependency order:

    * ``serializePaper`` fills ``paperIdMap``; it is read by
      ``serializeAuthorInstances``, ``serializeCitationInstance`` and
      ``serializeInstitution``.
    * ``serializeAuthors`` fills ``authorIdMap``; it is read by
      ``serializeAuthorInstances`` and ``serializeInstitution``.
    * ``serializeCitation`` fills ``citationIdMap``; it is read by
      ``serializeCitationInstance``.

    Calling a method before the maps it reads are filled raises ``KeyError``.
    """

    # Maps the corpus source to the Paper attribute that holds the paper id.
    # NOTE(review): there is no entry for source 4 (Scopus) although the
    # module-level ``serialize`` accepts it -- confirm the attribute to use.
    paper_source_map = {3: 'wosid', 2: 'url', 1: 'doi'}

    def __init__(self, corpus, source):
        """
        Parameters
        ----------
        corpus - The Corpus object that is to be serialized.
        source - This denotes the source of the corpus object.
            The possible values are 1 for JSTOR, 2 for ZOTERO, 3 for WOS and
            4 for Scopus. The source decides which attribute the paper id is
            read from, i.e. DOI for JSTOR, URL for ZOTERO and WOSID for WOS.

        The new corpus primary key is set to (max PK + 1):

        >>> self.corpus_id = tethnedao.getMaxCorpusID()+1

        The following hash maps are initialized empty and are used for
        foreign key references between the fixtures, e.g.

        >>> self.paperIdMap[paper.wosid] = pid

        keeps track of the primary key assigned to each paper. The same
        holds for authors, author instances, citations and institutions.
        """
        self.corpus = corpus
        self.source = source
        self.corpus_id = tethnedao.getMaxCorpusID()+1
        self.paperIdMap = {}
        self.authorIdMap = {}
        self.authorInstanceIdMap = {}
        self.citationIdMap = {}
        self.citationInstanceMap = {}
        self.instituteIdMap = {}
        self.instituteIdInstanceMap = {}

    def _paper_key(self, paper):
        """Return the identifier of ``paper`` for this corpus' source."""
        try:
            attribute = Serialize.paper_source_map[self.source]
        except KeyError:
            # Same exception type as the bare lookup, but with a message that
            # explains what is missing.
            raise KeyError('No paper identifier attribute is configured for '
                           'source %r' % (self.source,))
        return getattr(paper, attribute)

    @staticmethod
    def _parse_citation(citation):
        """Split a citation literal into (first_author, journal, date)."""
        readable = citation.replace('_', ' ')
        date_match = re.search(r'(\d+)', citation)
        if date_match is None:
            # Citations without a year carry the marker "NONE" instead.
            date_match = re.search(r"NONE", citation)
        if date_match is None:
            # No date marker at all: keep the whole literal as the author
            # part instead of failing on ``None.group()``.
            return readable.strip(), '', ''
        date = date_match.group()
        # Partition at the first occurrence only, so a journal part that
        # repeats the date string (e.g. a page number) is not truncated.
        first_author, _, journal = readable.partition(date)
        return first_author.rstrip(), journal.lstrip(), date

    def serializeCorpus(self):
        """
        This method creates a fixture for the "django-tethne_corpus" model.

        Returns
        -------
        corpus_details as a JSON-serializable list, which can be written to
        a file.
        """
        return [{
            "model": "django-tethne.corpus",
            "pk": self.corpus_id,
            "fields": {
                "source": self.source,
                "date_created": strftime("%Y-%m-%d %H:%M:%S", gmtime()),
                "length": len(self.corpus),
            }
        }]

    def serializePaper(self):
        """
        This method creates a fixture for the "django-tethne_paper" model and
        records the primary key of every paper in ``self.paperIdMap``.

        Returns
        -------
        paper_details as a JSON-serializable list, which can be written to a
        file.
        """
        pid = tethnedao.getMaxPaperID()
        papers_details = []
        for paper in self.corpus:
            pid += 1
            paper_key = self._paper_key(paper)
            self.paperIdMap[paper_key] = pid
            papers_details.append({
                "model": "django-tethne.paper",
                "pk": pid,
                "fields": {
                    "paper_id": paper_key,
                    "corpus": self.corpus_id,
                    "pub_date": getattr(paper, 'date', ''),
                    "volume": getattr(paper, 'volume', ''),
                    "title": getattr(paper, 'title', ''),
                    "abstract": getattr(paper, 'abstract', ''),
                }
            })
        return papers_details

    def serializeAuthors(self):
        """
        This method creates a fixture for the "django-tethne_author" model
        and records the primary key of every author in ``self.authorIdMap``
        under the key ``first_name + last_name``.

        Returns
        -------
        Author details as a JSON-serializable list, which can be written to
        a file.
        """
        author_details = []
        auid = tethnedao.getMaxAuthorID()
        # Each index value is read as (last_name, first_name).
        for val in self.corpus.features['authors'].index.values():
            auid += 1
            self.authorIdMap[val[1]+val[0]] = auid
            author_details.append({
                "model": "django-tethne.author",
                "pk": auid,
                "fields": {
                    "first_name": val[1],
                    "last_name": val[0],
                }
            })
        return author_details

    def serializeAuthorInstances(self):
        """
        This method creates a fixture for the "django-tethne_author_instance"
        model. It requires ``serializePaper`` and ``serializeAuthors`` to
        have been called first.

        Returns
        -------
        Author instance details as a JSON-serializable list, which can be
        written to a file.
        """
        author_instance_details = []
        au_instanceid = tethnedao.getMaxAuthorInstanceID()
        for paper in self.corpus:
            paper_key = self._paper_key(paper)
            for author in paper.authors:
                au_instanceid += 1
                # author[0] is read as (last_name, first_name).
                last_name = author[0][0]
                first_name = author[0][1]
                author_key = first_name + last_name
                self.authorInstanceIdMap[paper_key + author_key] = au_instanceid
                author_instance_details.append({
                    "model": "django-tethne.author_instance",
                    "pk": au_instanceid,
                    "fields": {
                        "paper": self.paperIdMap[paper_key],
                        "author": self.authorIdMap[author_key],
                        "first_name": first_name,
                        "last_name": last_name,
                    }
                })
                # No "django-tethne.author_identity" fixture is produced: the
                # author identity details are not available at this point.
        return author_instance_details

    def serializeCitation(self):
        """
        This method creates a fixture for the "django-tethne_citation" model
        and records the primary key of every citation in
        ``self.citationIdMap``.

        Returns
        -------
        citation details as a JSON-serializable list, which can be written
        to a file.
        """
        citation_details = []
        citation_id = tethnedao.getMaxCitationID()
        for citation in self.corpus.features['citations'].index.values():
            if citation in self.citationIdMap:
                # Already has a primary key; never emit a second row for it.
                continue
            first_author, journal, date = self._parse_citation(citation)
            citation_id += 1
            self.citationIdMap[citation] = citation_id
            citation_details.append({
                "model": "django-tethne.citation",
                "pk": citation_id,
                "fields": {
                    "literal": citation,
                    "journal": journal,
                    "first_author": first_author,
                    "date": date
                }
            })
        return citation_details

    def serializeCitationInstance(self):
        """
        This method creates a fixture for the
        "django-tethne_citation_instance" model. It requires
        ``serializePaper`` and ``serializeCitation`` to have been called
        first.

        Returns
        -------
        citation instance details as a JSON-serializable list, which can be
        written to a file.
        """
        citation_instance_data = []
        citation_instance_id = tethnedao.getMaxCitationInstanceID()
        for paper in self.corpus:
            paper_key = self._paper_key(paper)
            for citation_list in paper.citations:
                # The citation literal is the first element of each entry.
                citation = citation_list[0]
                citation_instance_id += 1
                first_author, journal, date = self._parse_citation(citation)
                citation_instance_data.append({
                    "model": "django-tethne.citation_instance",
                    "pk": citation_instance_id,
                    "fields": {
                        "literal": citation,
                        "citation": self.citationIdMap[citation],
                        "paper": self.paperIdMap[paper_key],
                        "journal": journal,
                        "first_author": first_author,
                        "date": date
                    }
                })
        return citation_instance_data

    def serializeInstitution(self):
        """
        This method creates fixtures for the "django-tethne.institution",
        "django-tethne.institution_instance" and "django-tethne.affiliation"
        models from the ``authorAddress`` attribute of every paper. Papers
        without that attribute are skipped. It requires ``serializePaper``
        and ``serializeAuthors`` to have been called first.

        ``authorAddress`` may be a single address string or a list of
        address strings. Within one paper, an address without an
        ``[author; ...]`` prefix is affiliated with the authors of the
        preceding address of the same paper.

        Returns
        -------
        A tuple ``(institution_data, institution_instance_data,
        affiliation_data)`` of JSON-serializable lists, which can be written
        to files.
        """
        institution_data = []
        institution_instance_data = []
        affiliation_data = []
        affiliation_id = tethnedao.getMaxAffiliationID()
        institution_id = tethnedao.getMaxInstitutionID()
        institution_instance_id = tethnedao.getMaxInstitutionInstanceID()

        # ``basestring`` only exists on Python 2; fall back to ``str`` so the
        # check does not raise NameError on Python 3.
        try:
            string_types = (basestring,)
        except NameError:
            string_types = (str,)

        for paper in self.corpus:
            if not hasattr(paper, 'authorAddress'):
                continue
            paper_key = self._paper_key(paper)

            addresses = paper.authorAddress
            if isinstance(addresses, string_types):
                addresses = [addresses]
            elif not isinstance(addresses, list):
                continue

            # Reset per paper: authors must never be inherited from an
            # address that belongs to a different paper.
            previous_authors = []
            for address in addresses:
                # Ids advance for every address, even when the institution
                # turns out to be already known.
                institution_id += 1
                institution_instance_id += 1
                institute_literal, authors = \
                    SerializeUtility.get_auth_inst(address)
                institute_row, institute_instance_row = \
                    self.get_details_from_inst_literal(
                        institute_literal, institution_id,
                        institution_instance_id, paper_key)
                if institute_row:
                    institution_data.append(institute_row)
                institution_instance_data.append(institute_instance_row)

                if authors is None:
                    authors = previous_authors
                for author in authors:
                    if not author.strip():
                        # A blank name cannot be matched to any author.
                        continue
                    affiliation_id += 1
                    affiliation_data.append(self.get_affiliation_details(
                        author, affiliation_id, institute_literal))
                previous_authors = authors

        return institution_data, institution_instance_data, affiliation_data

    def get_details_from_inst_literal(self, institute_literal, institution_id,
                                      institution_instance_id, paper_key):
        """
        This method parses the institute literal to get the following
        1. Department name
        2. Country
        3. University name
        4. ZIP, STATE and CITY (only if the country is USA. For other
           countries the standard may vary, so parsing these values becomes
           very difficult. However, the complete address can be found in
           the column "AddressLine1")

        Parameters
        ----------
        institute_literal - The literal value of the institute
        institution_id - Primary key to use if a new institution row is
            created
        institution_instance_id - Primary key of the institution instance row
        paper_key - The paper key used for the institution instance; it must
            already be present in ``self.paperIdMap``

        Returns
        -------
        A tuple ``(institute_row, institute_instance_row)``.
        ``institute_row`` is ``None`` when the institute literal has already
        been registered in ``self.instituteIdMap``.
        """
        institute_details = institute_literal.split(',')
        institute_name = institute_details[0]
        country = institute_details[-1].lstrip().replace('.', '')

        zipcode = ""
        state = ""
        city = ""
        if 'USA' in country:
            # The last component looks like "<STATE> <ZIP> USA" or
            # "<STATE> USA".
            usa_tokens = country.split()
            if len(usa_tokens) == 3:
                state, zipcode, country = usa_tokens
            elif len(usa_tokens) == 2:
                state, country = usa_tokens
            # The city is the component before the last one. Only use it
            # when it is distinct from the institute name; with fewer than
            # three components the index would wrap around to the name.
            if len(institute_details) > 2:
                city = institute_details[-2].lstrip()

        # Everything between the institute name and the country component.
        addressline1 = ','.join(institute_details[1:-1])

        institute_row = None
        if institute_literal not in self.instituteIdMap:
            self.instituteIdMap[institute_literal] = institution_id
            institute_row = {
                "model": "django-tethne.institution",
                "pk": institution_id,
                "fields": {
                    "institute_name": institute_name,
                    "addressLine1": addressline1,
                    "country": country,
                    "zip": zipcode,
                    "state": state,
                    "city": city
                }
            }

        department = ""
        department_match = re.search('Dept([^,]*),', institute_literal)
        if department_match is not None:
            department = department_match.group().replace(',', '')

        institute_instance_row = {
            "model": "django-tethne.institution_instance",
            "pk": institution_instance_id,
            "fields": {
                "institution": self.instituteIdMap[institute_literal],
                "literal": institute_literal,
                "institute_name": institute_name,
                "addressLine1": addressline1,
                "country": country,
                "paper": self.paperIdMap[paper_key],
                "department": department,
                "zip": zipcode,
                "state": state,
                "city": city
            }
        }
        return institute_row, institute_instance_row

    def get_affiliation_details(self, value, affiliation_id, institute_literal):
        """
        This method is used to map the affiliation between an author and an
        institution.

        Parameters
        ----------
        value - The author name, either "Last, First" or "Last First"
        affiliation_id - Primary key of the affiliation table
        institute_literal - The institute literal; it must already be
            present in ``self.instituteIdMap``

        Returns
        -------
        Affiliation details (JSON fixture) which can be written to a file.

        Raises
        ------
        KeyError if the author or the institute has not been registered in
        ``self.authorIdMap`` / ``self.instituteIdMap``.
        """
        tokens = tuple([t.upper().strip() for t in value.split(',')])
        if len(tokens) == 1:
            # No comma in the name: fall back to whitespace separation.
            tokens = value.split()

        if len(tokens) > 1:
            aulast, auinit = tokens[0:2]
        elif len(tokens) == 1:
            aulast, auinit = tokens[0], ''
        else:
            # Blank name: there is no token to index.
            aulast, auinit = '', ''

        aulast = _strip_punctuation(aulast).upper()
        auinit = _strip_punctuation(auinit).upper()
        # Same key layout as used when filling authorIdMap: first + last.
        author_key = auinit+aulast
        return {
            "model": "django-tethne.affiliation",
            "pk": affiliation_id,
            "fields": {
                "author": self.authorIdMap[author_key],
                "institution": self.instituteIdMap[institute_literal]
            }
        }
def serialize(dirPath, corpus, source):
    """
    Serialize a corpus into JSON fixture files, one file per model.

    Parameters
    ----------
    dirPath - A valid directory path where you want the JSON files to be
        written
    corpus - The corpus object which is to be serialized.
    source - The source of the corpus.
        The possible values are 1 for JSTOR, 2 for ZOTERO, 3 for WOS and
        4 for Scopus

    This method will raise an exception if
    1. dirPath is not a valid directory (IOError)
    2. the corpus object is None or has no papers to serialize (NameError)
    3. the source is not a valid source (ValueError)

    Following is an example to use the serialize method.

    .. code-block:: python

       >>> from tethne.readers import wos
       >>> from tethne.serialize import paper
       >>> wosCorpus = wos.read('/path/to/my/Corpus.txt')
       >>> paper.serialize('/path/to/my/FixturesDir/', wosCorpus, 3)

    Returns
    -------
    Nothing. Writes the fixtures at the directory location.
    """
    if not os.path.isdir(dirPath):
        raise IOError('No such file or directory')
    if corpus is None or len(corpus) < 1:
        raise NameError('The corpus object has no papers to serialize')
    if source not in (1, 2, 3, 4):
        raise ValueError("Not a valid source.")

    serializehandler = Serialize(corpus, source)

    # The call order matters: papers, authors and citations register the
    # primary keys that the later fixtures use as foreign keys.
    corpusDetails = serializehandler.serializeCorpus()
    papersDetails = serializehandler.serializePaper()
    authorDetails = serializehandler.serializeAuthors()
    authorInstanceDetails = serializehandler.serializeAuthorInstances()
    citationDetails = serializehandler.serializeCitation()
    citationInstanceDetails = serializehandler.serializeCitationInstance()
    institutions, institutionsInstance, affiliations = \
        serializehandler.serializeInstitution()

    # Everything is serialized before the first file is opened, so a failure
    # during serialization does not leave a partial set of fixtures behind.
    fixtures = (
        ("corpus.json", corpusDetails),
        ("paper.json", papersDetails),
        ("authors.json", authorDetails),
        ("authorInstance.json", authorInstanceDetails),
        ("citations.json", citationDetails),
        ("citationInstance.json", citationInstanceDetails),
        ("institutions.json", institutions),
        ("institutionInstance.json", institutionsInstance),
        ("affiliations.json", affiliations),
    )
    for fileName, details in fixtures:
        with open(os.path.join(dirPath, fileName), mode='w') as f:
            json.dump(details, f)