"""
Reader for Web of Science field-tagged bibliographic data.
Tethne parses Web of Science field-tagged data into a list of :class:`.Paper`
objects. This is a two-step process: data are first parsed into a list of
dictionaries with field-tags as keys, and then each dictionary is converted to a
:class:`.Paper` . :func:`.readers.wos.read` performs both steps in sequence.
One-step Parsing
````````````````
The method :func:`.readers.wos.read` performs both :func:`.readers.wos.parse`
and :func:`.readers.wos.convert` . This is the preferred (simplest) approach in
most cases.
.. code-block:: python
>>> papers = rd.wos.read("/Path/to/savedrecs.txt")
>>> papers[0]
<tethne.data.Paper instance at 0x101b575a8>
Alternatively, if you have many data files saved in the same directory, you can
use :func:`.readers.wos.from_dir` :
.. code-block:: python
>>> papers = rd.wos.from_dir("/Path/to")
Two-step Parsing
````````````````
Use the two-step approach if you need to access fields not included in
:class:`.Paper`\, or if you wish to perform some intermediate manipulation on
the raw parsed data.
First import the :mod:`.readers.wos` module:
.. code-block:: python
>>> import tethne.readers as rd
Then parse the WoS data to a list of field-tagged dictionaries using
:func:`.readers.wos.parse` :
.. code-block:: python
>>> wos_list = rd.wos.parse("/Path/to/savedrecs.txt")
>>> wos_list[0].keys()
['EM', '', 'CL', 'AB', 'WC', 'GA', 'DI', 'IS', 'DE', 'VL', 'CY', 'AU', 'JI',
'AF', 'CR', 'DT', 'TC', 'EP', 'CT', 'PG', 'PU', 'PI', 'RP', 'J9', 'PT',
'LA', 'UT', 'PY', 'ID', 'SI', 'PA', 'SO', 'Z9', 'PD', 'TI', 'SC', 'BP',
'C1', 'NR', 'RI', 'ER', 'SN']
Convert those field-tagged dictionaries to :class:`.Paper` objects using
:func:`.readers.wos.convert` :
.. code-block:: python
>>> papers = rd.wos.convert(wos_list)
>>> papers[0]
<tethne.data.Paper instance at 0x101b575a8>
Methods
```````
.. autosummary::
convert
from_dir
parse
read
"""
import xml.etree.ElementTree as ET
from ..utilities import *
from ..classes import Corpus, Paper
import os
import re
import uuid
# Module-level debug switch guarding the diagnostic ``print`` output below.
# 0 (falsy) prints nothing in the console.
# 1 (truthy) prints all diagnostic statements in the console.
DEBUG = 0
def _create_ayjid(aulast=None, auinit=None, date=None, jtitle=None, **kwargs):
    """
    Build the fuzzy author-year-journal identifier (ayjid) for a paper.

    The identifier is the upper-cased concatenation of surname, initial(s),
    year and journal title, separated by single spaces. A missing component
    contributes an empty string, so its separating spaces are retained.

    Parameters
    ----------
    aulast : string or list
        Author surname. If a list is given, only its first element is used.
    auinit : string or list
        Author initial(s). If a list is given, only its first element is used.
    date : string or int
        Four-digit year.
    jtitle : string
        Title of the journal.
    kwargs : dict
        Additional keyword arguments are accepted and ignored.

    Returns
    -------
    ayj : string
        Upper-cased fuzzy identifier ayjid, or 'UNKNOWN PAPER' if all id
        components are missing (None, empty list, or blank).
    """

    def _first_or_blank(value):
        # Author fields may be lists of names; only the first author is
        # used, and an empty list is treated like a missing value.
        if isinstance(value, list):
            value = value[0] if value else None
        return '' if value is None else value

    parts = [
        _first_or_blank(aulast),
        _first_or_blank(auinit),
        '' if date is None else str(date),
        '' if jtitle is None else jtitle,
    ]
    ayj = ' '.join(parts)

    # With every component blank the joined string holds only the three
    # separator spaces, so test for "nothing but whitespace" rather than
    # comparing against a literal of a particular length.
    if not ayj.strip():
        ayj = 'Unknown paper'

    return ayj.upper()
def _create_ainstid(aulast=None, auinit=None, addr1=None,
                    addr2=None, country=None, **kwargs):
    """
    Build the fuzzy author-institution identifier (ainstid).

    The identifier is the concatenation of surname, initial(s), both address
    parts and country, separated by single spaces. A missing component
    contributes an empty string, so its separating spaces are retained.

    Parameters
    ----------
    aulast : string or list
        Author surname. If a list is given, only its first element is used.
    auinit : string or list
        Author initial(s). If a list is given, only its first element is used.
    addr1 : string
        Address of the institution (first part).
    addr2 : string
        Address of the institution (second part).
    country : string
        Country of affiliation.
    kwargs : dict
        Additional keyword arguments are accepted and ignored.

    Returns
    -------
    ainstid : string
        Fuzzy identifier ainstid, or 'Unknown Institution' if all id
        components are missing (None, empty list, or blank).
    """

    def _first_or_blank(value):
        # Author fields may be lists of names; only the first author is
        # used, and an empty list is treated like a missing value.
        if isinstance(value, list):
            value = value[0] if value else None
        return '' if value is None else value

    parts = [
        _first_or_blank(aulast),
        _first_or_blank(auinit),
        '' if addr1 is None else addr1,
        '' if addr2 is None else addr2,
        '' if country is None else country,
    ]
    ainstid = ' '.join(parts)

    # With every component blank the joined string holds only the four
    # separator spaces, so test for "nothing but whitespace" rather than
    # comparing against a literal of a particular length.
    if not ainstid.strip():
        ainstid = 'Unknown Institution'

    return ainstid
# Web of Science functions
def parse(filepath):
    """
    Parse Web of Science field-tagged data.

    The first two lines of the file are treated as a header and skipped.
    Each record starts at a ``PT`` line and is added to the result when its
    ``ER`` line is reached. A line whose two-character tag position is blank
    continues the field of the preceding line.

    Parameters
    ----------
    filepath : string
        Filepath to the Web of Science plain text file.

    Returns
    -------
    wos_list : list
        A list of dictionaries each associated with a paper from the Web of
        Science with keys from docs/fieldtags.txt as encountered in the file;
        values are strings, except that AU, AF, DE, ID, CR, C1 and CA are
        converted to lists and PY is converted to an int.

    Raises
    ------
    IOError
        File at filepath not found, not readable, or empty; or non-blank
        data was found before the first ``PT`` line (i.e. the file does not
        look like WoS field-tagged data).
    ValueError
        The PY value of a record cannot be converted to an int.

    Examples
    --------
    .. code-block:: python

       >>> import tethne.readers as rd
       >>> wos_list = rd.wos.parse("/Path/to/data.txt")

    Notes
    -----
    Unknown keys: RI, OI, Z9
    """
    paper_start_key = 'PT'
    paper_end_key = 'ER'

    # Fields whose continuation lines are separate list elements; they are
    # accumulated with a newline delimiter and split into lists below.
    # C1 (author addresses) may span several lines or only one.
    newline_fields = ('AU', 'AF', 'CR', 'C1', 'CA')

    # Keys converted from string to list, with the delimiter used for each.
    list_delims = {'AU': '\n',
                   'AF': '\n',
                   'DE': ';',
                   'ID': ';',
                   'C1': '\n',
                   'CR': '\n',
                   'CA': '\n'}

    # Keys converted from string to int.
    int_keys = ('PY',)

    # Try to read filepath.
    try:
        with open(filepath, 'r') as f:
            line_list = f.read().splitlines()
    except IOError:     # File does not exist, or couldn't be read.
        raise IOError("File does not exist, or cannot be read.")

    if not line_list:
        raise IOError("Unable to read filepath or filepath is empty.")

    # Convert the data in the file to a usable list of dictionaries.
    wos_list = []
    wos_dict = None                     # No record has been opened yet.
    last_field_tag = paper_start_key    # Initialize to something.

    for line in line_list[2:]:
        line = strip_non_ascii(line)
        field_tag = line[:2]

        if field_tag == paper_start_key:
            # Then prepare for next paper.
            wos_dict = _new_wos_dict()
        elif wos_dict is None:
            # Nothing but blank lines may precede the first record.
            if line.strip():
                raise IOError("File does not contain Web of Science "
                              "field-tagged data.")
            continue

        if field_tag == paper_end_key:
            # Then add paper to our list. The same dict object keeps
            # receiving any lines that follow, until the next PT line.
            wos_list.append(wos_dict)

        # Handle keys like AU, AF, CR that continue over many lines: a
        # blank (whitespace-only) tag position means "same field as the
        # previous line".
        if field_tag.isspace():
            field_tag = last_field_tag

        # Add value for the key to the wos_dict: the rest of the line.
        value = str(line[3:])
        current = wos_dict.get(field_tag)
        if current is None:
            wos_dict[field_tag] = value
        elif field_tag in newline_fields:
            wos_dict[field_tag] = current + '\n' + value
        else:
            wos_dict[field_tag] = current + ' ' + value

        last_field_tag = field_tag

    for wos_dict in wos_list:
        # Convert the delimited string fields into lists. Missing keys,
        # None values and values that are already lists are left alone.
        for key, delim in list_delims.items():
            contents = wos_dict.get(key)
            if not isinstance(contents, str):
                continue
            if delim == '\n':
                wos_dict[key] = contents.splitlines()
            else:
                wos_dict[key] = contents.split(delim)

        # Similarly convert some data from string to int.
        for key in int_keys:
            try:
                wos_dict[key] = int(wos_dict[key])
            except (KeyError, TypeError):
                # The key is absent, or present but still None.
                pass

    return wos_list
def _parse_cr(ref):
    """
    Supports the Web of Science reader by converting the strings found
    at the CR field tag of a record into a minimum :class:`.Paper` instance.

    The reference is split on commas and read positionally as:
    ``aulast auinit, date, jtitle, volume, spage, doi``. Parsing stops at the
    first missing or malformed token; fields already set are kept and the
    remaining ones are left as :class:`.Paper` provides them.

    Parameters
    ----------
    ref : str
        CR field tag data from a plain text Web of Science file.

    Returns
    -------
    paper : :class:`.Paper`
        A :class:`.Paper` instance.

    Notes
    -----
    IndexError (too few tokens) and ValueError (e.g. a non-numeric date) are
    handled internally and are not propagated to the caller.

    Needs a sophisticated name parser, would like to use an open source
    resource for this.

    If WoS is missing a field in the middle of the list there are NOT commas
    indicating that; the following example does NOT occur:

        Doe J, ,, Some Journal

    instead

        Doe J, Some Journal

    This threatens the integrity of WoS data; should we address it?

    Another threat: if WoS is unsure of the DOI number there will be multiple
    DOI numbers in a list of form [doi1, doi2, ...], address this?
    """
    paper = Paper()

    # Tokens of form: aulast auinit, date, jtitle, volume, spage, doi
    tokens = ref.split(',')

    try:
        # FIXME: needs better name parser.
        name = tokens[0]

        # Temp solution for #62809724: when the name token contains a
        # bracketed part, use the text between the brackets as the name.
        bracketed = re.search(r'\[(.*?)\]', name)
        if bracketed:
            name = bracketed.group(1)
            if DEBUG:
                print('stripped name:  %s' % name)

        name_tokens = name.split(' ')
        if len(name_tokens) < 2:
            # No initials given: keep a single-space placeholder so the
            # resulting identifier keeps its existing form.
            name_tokens.append(' ')

        paper['aulast'] = [name_tokens[0]]
        paper['auinit'] = [''.join(name_tokens[1:]).replace('.', '')]

        if DEBUG:
            print("Final Meta Dicts %s %s" % (paper['aulast'],
                                              paper['auinit']))

        # NOTE: an earlier check compared these list values with the string
        # 'None' (never true) and tried to raise a plain string; it has been
        # dropped because it could not take effect.

        # Strip initial characters based on the field (spaces, 'V', 'DOI').
        # The date is stripped rather than sliced so a missing leading
        # space cannot eat the first digit of the year.
        paper['date'] = int(tokens[1].strip())
        paper['jtitle'] = tokens[2][1:]
        paper['volume'] = tokens[3][2:]
        paper['spage'] = tokens[4][2:]
        paper['doi'] = tokens[5][5:]
    except IndexError:
        # ref did not have the full set of tokens.
        pass
    except ValueError:
        # This occurs when the program expects a date but gets a string
        # with no numbers. We leave the field incomplete because chances
        # are the CR string is too sparse to use anyway.
        pass

    # Assumes Paper yields None for fields that were never set -- TODO
    # confirm against the Paper class.
    ayjid = _create_ayjid(paper['aulast'], paper['auinit'],
                          paper['date'], paper['jtitle'])
    paper['ayjid'] = ayjid

    return paper
def convert(wos_data):
    """
    Convert parsed field-tagged data to :class:`.Paper` instances.

    Convert a dictionary or list of dictionaries with keys from the
    Web of Science field tags into a list of :class:`.Paper` instances, the
    standard for Tethne.

    Every :class:`.Paper` produced by one call is tagged with the same
    accession id (a fresh UUID generated for this conversion). The input
    dictionaries are not modified.

    Parameters
    ----------
    wos_data : list
        A list of dictionaries with keys from the WoS field tags. A single
        dictionary is also accepted and treated as a one-item list.

    Returns
    -------
    papers : list
        A list of :class:`.Paper` instances.

    Examples
    --------
    .. code-block:: python

       >>> import tethne.readers as rd
       >>> wos_list = rd.wos.parse("/Path/to/data.txt")
       >>> papers = rd.wos.convert(wos_list)

    Notes
    -----
    Need to handle author name anomalies (case, blank spaces, etc.) that may
    make the same author appear to be two different authors in Networkx; this
    is important for any graph with authors as nodes.
    """
    accession = str(uuid.uuid4())

    # Handle dict inputs by converting to a 1-item list.
    if isinstance(wos_data, dict):
        wos_data = [wos_data]

    # Validate before building the papers list [62809724]. The result is
    # advisory only for now: a failed validation does not stop conversion.
    _validate(wos_data)

    # Define the direct relationships between WoS fieldtags and Paper keys.
    translator = _wos2paper_map()

    papers = []
    for wos_dict in wos_data:
        # Work on a shallow copy so that merging group authors below never
        # alters the caller's data (repeated calls would otherwise append
        # the group authors again each time).
        record = dict(wos_dict)
        paper = Paper()

        # Direct translations; an absent field tag is treated as None.
        for wos_key, paper_key in translator.items():
            paper[paper_key] = record.get(wos_key)

        # Group authors ('CA') are treated as personal authors.
        group_authors = record.get('CA')
        if group_authors is not None:
            for tag in ('AU', 'AF'):
                existing = record.get(tag)
                if existing is None:
                    record[tag] = list(group_authors)
                else:
                    record[tag] = list(existing) + list(group_authors)

        # More complicated translations.
        # FIXME: not robust to all names, organization authors, etc.
        if record.get('AU') is not None:
            paper['aulast'], paper['auinit'] = _handle_authors(record)

        # Construct ayjid.
        paper['ayjid'] = _create_ayjid(paper['aulast'], paper['auinit'],
                                       paper['date'], paper['jtitle'])

        # Parse author-institution affiliations. #60216226, #57746858.
        if record.get('C1') is not None:
            paper['institutions'] = _handle_author_institutions(record)

        # Convert CR references into paper format.
        if record.get('CR') is not None:
            paper['citations'] = [_parse_cr(ref) for ref in record['CR']]

        paper['accession'] = accession
        papers.append(paper)

    return papers
def _handle_authors(wos_dict):
    """
    Split the AU names of a record into parallel surname / initials lists.

    Each name is expected in the form ``'Surname, Initials'``. Both parts are
    upper-cased and stripped of surrounding whitespace. Only the text
    between the first and second comma is used for the initials.

    Parameters
    ----------
    wos_dict : dict
        A field-tagged record whose 'AU' value is a list of name strings.

    Returns
    -------
    aulast_list : list
        Upper-cased surnames, one per author.
    auinit_list : list
        Upper-cased initials, one per author; an empty string where a name
        has no comma-separated initials, so both lists stay parallel.
    """
    aulast_list = []
    auinit_list = []

    for name in wos_dict['AU']:
        name_tokens = name.split(',')
        aulast_list.append(name_tokens[0].upper().strip())

        if len(name_tokens) > 1:
            # Strip whitespace rather than blindly dropping the first
            # character, so 'Doe,J' keeps its initial just like 'Doe, J'.
            auinit_list.append(name_tokens[1].upper().strip())
        else:
            # No initials given: preserve parallel name lists with an
            # empty string.
            auinit_list.append('')

    return aulast_list, auinit_list
def _handle_author_institutions(wos_dict):
    """
    Map each author of a record to the institutions named in its C1 lines.

    Two C1 layouts are handled:

    * Explicit author-institution mappings, e.g.::

          [Lin, Bing-Sian; Lee, Chon-Lin] Natl Sun Yat Sen Univ, Dept
             Marine Environm & Engn, Kaohsiung 80424, Taiwan.
          [Brimblecombe, Peter] Univ E Anglia, Sch Environm Sci, Norwich NR4
             7TJ, Norfolk, England.

      The bracketed names use the AF (full name) representation; they are
      translated to the AU representation via their position in AF.
      Bracketed names that cannot be matched to an AU entry are skipped.

    * No mappings, e.g.::

          UN, Environm Programme, Nairobi, Kenya.
          Univ Haifa, Dept Geog, IL-31095 Haifa, Israel.

      Every author in AU is then mapped to every such institution.

    Institutions are recorded as ``'<FIRST PART>, <LAST PART>'`` of the
    upper-cased, comma-separated address.

    Parameters
    ----------
    wos_dict : dict
        A field-tagged record with list values at 'C1', 'AU' and (for
        explicit mappings) 'AF'. Missing or None values are treated as empty.

    Returns
    -------
    author_institutions : dict
        Keys are upper-cased AU names with commas removed; values are lists
        of institution names, one entry per C1 line that mentions the author
        (duplicates are kept so callers can tally 'votes').
    """
    pattern = re.compile(r'\[(.*?)\]')

    au_names = wos_dict.get('AU') or []
    af_names = wos_dict.get('AF') or []
    author_institutions = {}

    def _inst_name(parts):
        # Keep only the first and the last part of the address.
        return ', '.join([parts[0], parts[-1].strip()])

    for c1_str in wos_dict.get('C1') or []:     # One C1 line per institution.
        match = pattern.search(c1_str)

        if match:   # Explicit author-institution mappings are provided.
            authors = match.group(1).split('; ')
            institution = c1_str[match.end():].upper() \
                                              .strip() \
                                              .strip('.') \
                                              .split(', ')
            inst_name = _inst_name(institution)

            for author in authors:
                # The A-I mapping (in data) uses the AF representation of
                # author names. But we use the AU representation as our
                # mapping key to ensure consistency with older datasets.
                try:
                    author_au = au_names[af_names.index(author)]
                except (ValueError, IndexError):
                    # The bracketed name has no AF/AU counterpart, so
                    # there is no key to file this institution under.
                    continue

                author_au = author_au.upper().replace(',', '')
                author_institutions.setdefault(author_au, []) \
                                   .append(inst_name)

        else:       # Author-institution mappings are not provided. We
                    # therefore map all authors to all institutions.
            institution = c1_str.upper() \
                                .strip() \
                                .strip('.') \
                                .split(',')
            inst_name = _inst_name(institution)

            for author_au in au_names:
                author_au = author_au.upper().replace(',', '')
                author_institutions.setdefault(author_au, []) \
                                   .append(inst_name)

    return author_institutions
def read(datapath):
    """
    Return a list of :class:`.Paper` instances from a Web of Science data
    file.

    Performs :func:`.readers.wos.parse` and :func:`.readers.wos.convert` in
    sequence.

    Parameters
    ----------
    datapath : string
        Filepath to the Web of Science field-tagged data file.

    Returns
    -------
    papers : list
        A list of :class:`.Paper` instances.

    Raises
    ------
    IOError
        Propagated from parsing, with the parser's own message, when the
        file cannot be read or is empty.

    Examples
    --------
    .. code-block:: python

       >>> import tethne.readers as rd
       >>> papers = rd.wos.read("/Path/to/data.txt")
    """
    # Any IOError is deliberately left to propagate unchanged: re-raising it
    # as a generic "Invalid path." would hide the actual cause (for example,
    # an existing but empty file).
    wos_list = parse(datapath)
    return convert(wos_list)
# [#60462784]
def from_dir(path):
    """
    Convenience function for generating a list of :class:`.Paper` from a
    directory of Web of Science field-tagged data files.

    Hidden entries (names starting with '.') are ignored, and entries that
    cannot be parsed as WoS data are skipped. Files are processed in sorted
    name order so the result is reproducible.

    Parameters
    ----------
    path : string
        Path to directory of field-tagged data files.

    Returns
    -------
    papers : list
        A list of :class:`.Paper` objects.

    Raises
    ------
    IOError
        Invalid path.

    Examples
    --------
    .. code-block:: python

       >>> import tethne.readers as rd
       >>> papers = rd.wos.from_dir("/Path/to/datadir")
    """
    try:
        files = os.listdir(path)
    except (IOError, OSError):
        # os.listdir reports failures as OSError, which on Python 2 is not
        # an IOError; catch both so that the documented IOError is what
        # callers actually receive.
        raise IOError("Invalid path.")

    wos_list = []
    for name in sorted(files):
        if name.startswith('.'):    # Ignore hidden files.
            continue
        try:
            wos_list += parse(os.path.join(path, name))
        except (IOError, UnboundLocalError):
            # Ignore entries that don't contain WoS data (including
            # sub-directories, which cannot be opened as files).
            pass

    return convert(wos_list)
def read_corpus(path):
    """
    Read a Web of Science data file into a :class:`.Corpus`.

    Parameters
    ----------
    path : string
        Filepath to the Web of Science field-tagged data file.

    Returns
    -------
    corpus : :class:`.Corpus`
        A :class:`.Corpus` built from the papers returned by
        :func:`.readers.wos.read`\, constructed with ``index_by='wosid'``.
    """
    return Corpus(read(path), index_by='wosid')
def corpus_from_dir(path):
    """
    Read a directory of Web of Science data files into a :class:`.Corpus`.

    Parameters
    ----------
    path : string
        Path to directory of field-tagged data files.

    Returns
    -------
    corpus : :class:`.Corpus`
        A :class:`.Corpus` built from the papers returned by
        :func:`.readers.wos.from_dir`\, constructed with
        ``index_by='wosid'``.
    """
    return Corpus(from_dir(path), index_by='wosid')
# [62809724]
def _validate(wos_data):
    """
    Check that parsed Web of Science records are in the expected format.

    No field-level checks are implemented yet, so every input is currently
    reported as valid. When the module-level DEBUG switch is on, the input
    and the value of each expected field tag are printed.

    Parameters
    ----------
    wos_data : list
        A list of dictionaries with keys from the WoS field tags.

    Returns
    -------
    status : bool
        True if the data are in the expected format, False otherwise
        (currently always True).
    """
    if DEBUG:
        print(wos_data)

        # The field tags a record is expected to carry.
        expected_keys = _new_wos_dict()
        for wos_dict in wos_data:
            for key in expected_keys:
                # .get() so that a record lacking a tag prints None
                # instead of aborting the diagnostic output.
                print(wos_dict.get(key))

    # Placeholder [62809724]: checks that the 'CR' (cited references) and
    # 'C1' (author addresses) fields are populated correctly belong here.
    # Any new field which needs validation in the future should be added
    # alongside them.

    return True
def _new_query_dict():
    """
    Declares only those keys of the :class:`.Paper`'s metadata that are
    queryable through CrossRef.

    Returns
    -------
    q_dict : dict
        A new dictionary with every queryable key set to None.
    """
    queryable_keys = (
        'aulast',
        'auinit',
        'atitle',
        'address',
        'jtitle',
        'volume',
        'issue',
        'spage',
        'epage',
        'date',
    )
    # fromkeys() fills every key with the default value None.
    return dict.fromkeys(queryable_keys)
def _new_wos_dict():
    """
    Defines the set of field tags that will try to be converted, and
    initializes them to None.

    Returns
    -------
    wos_dict : dict
        A new wos_list dictionary with None as default value for all keys.
    """
    field_tags = (
        'DI',
        'AU',
        'C1',
        'TI',
        'SO',
        'VL',
        'IS',
        'BP',
        'EP',
        'PY',
        'UT',
        'CR',
        'AB',
    )
    # fromkeys() fills every tag with the default value None.
    return dict.fromkeys(field_tags)
def _wos2paper_map():
    """
    Defines the direct relationships between the wos_dict and
    :class:`.Paper`.

    Returns
    -------
    translator : dict
        A new 'translator' dictionary whose keys are WoS field tags and
        whose values are the corresponding :class:`.Paper` keys.
    """
    tag_to_paper_key = (
        ('DI', 'doi'),
        ('TI', 'atitle'),
        ('SO', 'jtitle'),
        ('VL', 'volume'),
        ('IS', 'issue'),
        ('BP', 'spage'),
        ('EP', 'epage'),
        ('PY', 'date'),
        ('UT', 'wosid'),
        ('AB', 'abstract'),
    )
    return dict(tag_to_paper_key)
#Custom Error Defined
class DataError(Exception):
    """
    Custom exception type defined by the Web of Science reader.

    NOTE(review): presumably intended for data that are not in the expected
    format; nothing in this module raises it yet -- confirm intended use.
    """