
Source code for tethne.readers.scopus

"""
Reader for Scopus CSV data files.
"""

import csv
import re
import os
import uuid

from unidecode import unidecode

import sys
PYTHON_3 = sys.version_info[0] == 3
if PYTHON_3:
    unicode = str
    xrange = range

from tethne import Paper, Corpus

def read_corpus(path):
    """
    Read a Scopus CSV data file and return a :class:`.Corpus`.
    """
    papers = read(path)
    return Corpus(papers, index_by='eid')
def corpus_from_dir(path):
    """
    Parameters
    ----------
    path : string
        Path to directory of Scopus CSV data files.

    Returns
    -------
    corpus : :class:`.Corpus`
        A :class:`.Corpus` built from the :class:`.Paper`\s in ``path``,
        indexed by EID.
    """
    papers = from_dir(path)
    return Corpus(papers, index_by='eid')
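# --- Illustrative usage (not part of the original module) ---
# A minimal sketch of how the two convenience constructors above might be
# called, following the import style used in the docstrings below; the paths
# are hypothetical placeholders.
#
#     >>> import tethne.readers as rd
#     >>> corpus = rd.scopus.read_corpus("/Path/to/scopus.csv")     # One file.
#     >>> corpus = rd.scopus.corpus_from_dir("/Path/to/datadir")    # A directory.
#
# Both return a :class:`.Corpus` whose papers are indexed by their Scopus EID.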
def from_dir(path):
    """
    Convenience function for generating a list of :class:`.Paper` from a
    directory of Scopus CSV data files.

    Parameters
    ----------
    path : string
        Path to directory of Scopus CSV data files.

    Returns
    -------
    papers : list
        A list of :class:`.Paper` objects.

    Raises
    ------
    IOError
        Invalid path.

    Examples
    --------

    .. code-block:: python

       >>> import tethne.readers as rd
       >>> papers = rd.scopus.from_dir("/Path/to/datadir")

    """
    papers = []

    try:
        files = os.listdir(path)
    except (IOError, OSError):
        raise IOError("Invalid path.")

    for f in files:
        if not f.startswith('.') and f.endswith('csv'):  # Ignore hidden files.
            try:
                papers += read('{0}/{1}'.format(path, f))
            except (IOError, UnboundLocalError):    # Ignore files that don't
                pass                                #  contain Scopus data.

    return papers
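# A minimal sketch of the filename filter applied in from_dir() above; the
# file names are fabricated for illustration.
#
#     >>> keep = lambda f: not f.startswith('.') and f.endswith('csv')
#     >>> [f for f in ['scopus1.csv', '.hidden.csv', 'notes.txt'] if keep(f)]
#     ['scopus1.csv']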
def read(datapath, **kwargs):
    """
    Returns a list of :class:`.Paper` instances from a Scopus CSV data file.

    Parameters
    ----------
    datapath : string
        Filepath to the Scopus CSV data file.

    Returns
    -------
    papers : list
        A list of :class:`.Paper` instances.

    Examples
    --------

    .. code-block:: python

       >>> import tethne.readers as rd
       >>> papers = rd.scopus.read("/Path/to/scopus.csv")

    """
    accession = unicode(uuid.uuid4())   # Accession ID.

    papers = kwargs.get('papers', [])   # Can provide an alternate container.

    rawdata = []
    with open(datapath, 'rb') as f:
        reader = csv.reader(f)
        for row in reader:
            rawdata.append([unidecode(r.decode('UTF-8')) for r in row])

    headers = rawdata[0]    # First row is the header.
    for datum in rawdata[1:]:
        p = Paper()
        rawdatum = {headers[i]: datum[i] for i in xrange(len(headers))}

        p['aulast'], p['auinit'] = _handle_authors(rawdatum['Authors'])
        p['date'] = int(rawdatum['Year'].strip())
        p['atitle'] = rawdatum['Title'].strip().upper()
        p['jtitle'] = rawdatum['Source title'].strip().upper()
        # jtitle must be set before the fuzzy identifier is built.
        p['ayjid'] = _create_ayjid(p['aulast'], p['auinit'],
                                   p['date'], p['jtitle'])
        p['institutions'] = _handle_affiliations(
            rawdatum['Authors with affiliations'], p['aulast'], p['auinit'])
        p['volume'] = rawdatum['Volume'].strip()
        p['issue'] = rawdatum['Issue'].strip()
        p['spage'] = rawdatum['Page start'].strip()
        p['epage'] = rawdatum['Page end'].strip()
        p['abstract'] = rawdatum['Abstract'].strip().upper()
        p['citations'] = _handle_references(rawdatum['References'])
        p['accession'] = accession

        # DOI and PMID are not always present, but must be unique. If they are
        #  not provided, they should be set to None so that they can be
        #  handled appropriately downstream. EID should always be present.
        doi = rawdatum['DOI'].strip()
        if doi == '':
            doi = None
        p['doi'] = doi

        pmid = rawdatum['PubMed ID'].strip()
        if pmid == '':
            pmid = None
        p['pmid'] = pmid

        p['eid'] = rawdatum['EID'].strip()

        papers.append(p)

    return papers
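# read() assumes that the first row of the file is a Scopus export header; the
# columns accessed above are:
#
#     Authors, Authors with affiliations, Title, Year, Source title, Volume,
#     Issue, Page start, Page end, Abstract, References, DOI, PubMed ID, EID
#
# A minimal usage sketch (the path is a hypothetical placeholder):
#
#     >>> papers = read("/Path/to/scopus.csv")
#     >>> papers[0]['eid'] is not None    # EID should always be present.
#     True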
def _handle_authors(authordata):
    aulast = []
    auinit = []
    for author in authordata.split(', '):
        try:
            a = [a_.strip().upper().replace('.', '') for a_ in author.split()]
            aulast.append(' '.join(a[0:-1]))
            auinit.append(a[-1])
        except IndexError:  # Empty record; stray delimiter.
            pass

    return aulast, auinit


def _handle_affiliations(affiliationsdata, aulast, auinit):
    def matching_author(last, init, instcleaned):
        au_parts = [last, init]
        matching = 0
        for x in xrange(len(au_parts)):
            if au_parts[x] == instcleaned[x]:
                matching += 1
        if matching > 1:
            return True, matching
        return False, 0

    institutions = {}

    aff_split = affiliationsdata.split(';')
    A = len(aff_split)
    for i in xrange(A):
        aff = aff_split[i]
        a = aff.split(', ')
        try:
            cleaned = [a_.strip().upper().replace('.', '') for a_ in a]

            # First work on the assumption that affiliations are in the same
            #  order as author names.
            aul = aulast[i]
            aui = auinit[i]
            match, overlap = matching_author(aul, aui, cleaned)
            if match:
                aname = ' '.join([aul, aui])
            # That might not always be true, so we'll try the other authors
            #  just in case.
            else:
                found = False
                for x in xrange(len(aulast)):
                    aul = aulast[x]
                    aui = auinit[x]
                    match, overlap = matching_author(aul, aui, cleaned)
                    if match:
                        aname = ' '.join([aul, aui])
                        found = True
                        break
                if not found:   # If the author can't be identified, discard.
                    break

            institution = a[match+1:]   # The remainder is the institution name.
            l = len(institution)
            if l == 0:
                continue
            if l == 1:
                nation = institution[0].split()[-1].upper()
                inst = ' '.join(institution[0].split()[0:-1]).upper()
            elif l == 2:
                nation = institution[1].upper()
                inst = institution[0].upper()
            else:
                nation = institution[-1].upper()
                inst = ', '.join(institution[0:2]).upper()

            institutions[aname] = [', '.join([inst, nation])]
        except IndexError:  # Blank record part (stray delimiter).
            pass

    # inst_list should be a list of lists, ordered by aulast/auinit.
    authors = [' '.join(a) for a in zip(aulast, auinit)]    # To use as keys.
    inst_list = []
    for au in authors:
        if au in institutions:      # Data available.
            inst_list.append(institutions[au])
        else:
            inst_list.append([])    # No data for that author.

    return inst_list


def _create_ayjid(aulast=None, auinit=None, date=None, jtitle=None, **kwargs):
    """
    Convert aulast, auinit, and jtitle into the fuzzy identifier ayjid.

    Returns 'Unknown paper' if all id components are missing (None).

    Parameters
    ----------
    kwargs : dict
        A dictionary of keyword arguments.
    aulast : string
        Author surname.
    auinit : string
        Author initial(s).
    date : string
        Four-digit year.
    jtitle : string
        Title of the journal.

    Returns
    -------
    ayj : string
        Fuzzy identifier ayjid, or 'Unknown paper' if all id components are
        missing (None).
    """
    if aulast is None or len(aulast) == 0:
        aulast = ''
    elif isinstance(aulast, list):
        aulast = aulast[0]

    if auinit is None or len(auinit) == 0:
        auinit = ''
    elif isinstance(auinit, list):
        auinit = auinit[0]

    if date is None:
        date = ''

    if jtitle is None:
        jtitle = ''

    ayj = aulast + ' ' + auinit + ' ' + unicode(date) + ' ' + jtitle

    if ayj == '   ':
        ayj = 'Unknown paper'

    return ayj.upper()


def _handle_references(referencesdata):
    """
    Use a series of RegEx patterns to (roughly) parse references into
    :class:`.Paper` objects.
    """
    references = []    # A list of Papers.

    for r in referencesdata.strip().split(';'):    # Each cited reference.
        # Split at date first, e.g. ' ... (1995) ... '
        m = re.search('(.*)\((?P<date>[0-9]{4})\)(.*)', r)
        if m is not None:
            date = int(m.group('date'))
            pre_date = m.group(1).strip()   # Authors, title (unless it's a book).
            post_date = m.group(3).strip()  # Journal, volume, pages, etc.

            # Handle authors and title.
            # Looks for one or more like 'Author, J.K.L., '
            m_ = re.findall('([a-zA-Z]*,\s[A-Z\.]*,[\s]*)', pre_date)
            if m_ is not None:  # Books will pass, but we won't use title.
                # Remove commas between surnames and initials.
                a_ref = ', '.join([' '.join(a.split(', ')) for a in m_])
                aulast, auinit = _handle_authors(a_ref)

                # Remainder is title.
                l = len(''.join(m_))
                title = pre_date[l:].upper().strip()
            else:   # Let the rest go for now.
                continue

            # Handle journal, volume, issue, pages.
            m_ = re.search('(.*)[p]{1,2}\.(.*)', post_date)
            if m_ is not None:  # Books will fail here.
                pages = m_.group(2).strip()
                pre_pp = m_.group(1)

                # Get start (and end) page number(s).
                if len(pages.split('-')) == 2:  # Both start and end.
                    spage = pages.split('-')[0]
                    epage = pages.split('-')[1]
                else:   # Only one page.
                    spage = pages
                    epage = pages

                # Get journal details.
                jtitle = pre_pp.split(', ')[0].strip().upper()
                issue_volume = pre_pp.split(', ')[1].strip()

                # Look for an issue number, in parentheses.
                j = re.search('([a-z0-9]*) \(([a-z0-9]*)\)', issue_volume)
                if j is not None:
                    volume = j.group(1)
                    issue = j.group(2)
                else:   # No issue number.
                    volume = issue_volume
                    issue = ''
            else:   # Probably a book.
                # TODO: Make this better.
                title = post_date.strip()   # Includes publisher, etc.
                jtitle = ''
                issue = ''
                volume = ''
                spage = ''
                epage = ''
        else:   # Let the rest go for now.
            continue

        # Populate the Paper object.
        p = Paper()
        p['atitle'] = title
        p['aulast'] = aulast
        p['auinit'] = auinit
        p['issue'] = issue
        p['date'] = date
        p['volume'] = volume
        p['jtitle'] = jtitle
        p['spage'] = spage
        p['epage'] = epage
        p['ayjid'] = _create_ayjid(p['aulast'], p['auinit'],
                                   p['date'], p['jtitle'])

        references.append(p)

    return references
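# A rough sketch of what _handle_references() produces for a single reference
# string; the citation below is fabricated for illustration, and real Scopus
# reference strings vary in format.
#
#     >>> refs = _handle_references(
#     ...     'Smith, J.A., A study of things (1995) '
#     ...     'J. Important Res., 12 (3), pp. 45-67')
#     >>> refs[0]['ayjid']
#     'SMITH JA 1995 J. IMPORTANT RES.'
#     >>> refs[0]['atitle'], refs[0]['volume'], refs[0]['issue']
#     ('A STUDY OF THINGS', '12', '3')
#     >>> refs[0]['spage'], refs[0]['epage']
#     ('45', '67')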