# Source code for tethne.readers.wos

"""
Reader for Web of Science field-tagged bibliographic data.

Tethne parses Web of Science field-tagged data into a list of :class:`.Paper` 
objects. This is a two-step process: data are first parsed into a list of 
dictionaries with field-tags as keys, and then each dictionary is converted to a
:class:`.Paper` . :func:`.readers.wos.read` performs both steps in sequence.

One-step Parsing
````````````````

The method :func:`.readers.wos.read` performs both :func:`.readers.wos.parse` 
and :func:`.readers.wos.convert` . This is the preferred (simplest) approach in
most cases.

.. code-block:: python

   >>> papers = rd.wos.read("/Path/to/savedrecs.txt")
   >>> papers[0]
   <tethne.data.Paper instance at 0x101b575a8>

Alternatively, if you have many data files saved in the same directory, you can 
use :func:`.readers.wos.from_dir` :

.. code-block:: python

   >>> papers = rd.wos.from_dir("/Path/to")

Two-step Parsing
````````````````

Use the two-step approach if you need to access fields not included in 
:class:`.Paper`\, or if you wish to perform some intermediate manipulation on
the raw parsed data.

First import the :mod:`.readers.wos` module:

.. code-block:: python

   >>> import tethne.readers as rd

Then parse the WoS data to a list of field-tagged dictionaries using 
:func:`.readers.wos.parse` :

.. code-block:: python

   >>> wos_list = rd.wos.parse("/Path/to/savedrecs.txt")
   >>> wos_list[0].keys()
   ['EM', '', 'CL', 'AB', 'WC', 'GA', 'DI', 'IS', 'DE', 'VL', 'CY', 'AU', 'JI', 
    'AF', 'CR', 'DT', 'TC', 'EP', 'CT', 'PG', 'PU', 'PI', 'RP', 'J9', 'PT', 
    'LA', 'UT', 'PY', 'ID', 'SI', 'PA', 'SO', 'Z9', 'PD', 'TI', 'SC', 'BP', 
    'C1', 'NR', 'RI', 'ER', 'SN']

Convert those field-tagged dictionaries to :class:`.Paper` objects using 
:func:`.readers.wos.convert` :

.. code-block:: python

   >>> papers = rd.wos.convert(wos_list)
   >>> papers[0]
   <tethne.data.Paper instance at 0x101b575a8>

Methods
```````

.. autosummary::

   convert
   from_dir
   parse
   read

"""

import xml.etree.ElementTree as ET
from ..utilities import *
from ..classes import Corpus, Paper
import os
import re
import uuid

# Module-wide switch for the diagnostic ``print`` output guarded by
# ``if DEBUG`` in this module.
# 0 (falsy) prints nothing to the console.
# 1 (truthy) prints every guarded diagnostic statement to the console.
DEBUG = 0

def _create_ayjid(aulast=None, auinit=None, date=None, jtitle=None, **kwargs):
    """
    Convert aulast, auinit, and jtitle into the fuzzy identifier ayjid
    Returns 'Unknown paper' if all id components are missing (None).

    Parameters
    ----------
    Kwargs : dict
        A dictionary of keyword arguments.
    aulast : string
        Author surname.
    auinit: string
        Author initial(s).
    date : string
        Four-digit year.
    jtitle : string
        Title of the journal.

    Returns
    -------
    ayj : string
        Fuzzy identifier ayjid, or 'Unknown paper' if all id components are
        missing (None).

    """
    if aulast is None:
        aulast = ''
    elif isinstance(aulast, list):
        aulast = aulast[0]

    if auinit is None:
        auinit = ''
    elif isinstance(auinit, list):
        auinit = auinit[0]

    if date is None:
        date = ''

    if jtitle is None:
        jtitle = ''

    ayj = aulast + ' ' + auinit + ' ' + str(date) + ' ' + jtitle

    if ayj == '   ':
        ayj = 'Unknown paper'

    return ayj.upper()

def _create_ainstid(aulast=None, auinit=None, addr1=None, \
                   addr2=None, country=None, **kwargs):
    """
    This function is to create an fuzzy identifier ainstid.
    Convert aulast, auinit, and jtitle into the fuzzy identifier ainstid.
    Returns 'Unknown Institution' if all id components are missing (None).

    Parameters
    ----------
    Kwargs : dict
        A dictionary of keyword arguments.
    aulast : string
        Author surname.
    auinit : string
        Author initial(s).
    address1 : string
        Address of the Institution.
    address2 : string
        Address of the Institution.
    country : string
        Country of affiliation

    Returns
    -------
    ainstid : string
        Fuzzy identifier ainstid, or 'Unknown Institution' if all id components
        are missing (None).

    """
    if aulast is None:
        aulast = ''
    elif isinstance(aulast, list):
        aulast = aulast[0]

    if auinit is None:
        auinit = ''
    elif isinstance(auinit, list):
        auinit = auinit[0]

    if addr1 is None:
        addr1 = ''

    if  addr2 is None:
        addr2 = ''
    if  country is None:
        country = ''

    ainstid = aulast + ' ' + auinit + ' ' + addr1 + ' ' + addr2 + ' ' + country

    if ainstid == ' ':
        ainstid = 'Unknown Institution'

    return ainstid


# Web of Science functions
def parse(filepath):
    """
    Parse Web of Science field-tagged data.

    The first two lines of the file are skipped. Every following line is
    assigned to the field tag found in its first two characters; a line that
    starts with two spaces continues the previous tag. A 'PT' line starts a
    new record and an 'ER' line adds the current record to the result.

    Parameters
    ----------
    filepath : string
        Filepath to the Web of Science plain text file.

    Returns
    -------
    wos_list : list
        A list of dictionaries each associated with a paper from the Web of
        Science with keys from docs/fieldtags.txt as encountered in the file;
        most values associated with keys are strings. The fields AU, AF, DE,
        ID, CR, C1 and CA are converted to lists and PY is converted to int.

    Raises
    ------
    IOError
        File at filepath not found, not readable, or empty.
    UnboundLocalError
        A data line or an 'ER' line occurs before the first 'PT' line, i.e.
        the file does not look like WoS data. :func:`.from_dir` catches this
        to skip such files, so the exception type is kept as it was.
    ValueError
        The PY field of a record cannot be converted to an int.

    Examples
    --------

    .. code-block:: python

       >>> import tethne.readers as rd
       >>> wos_list = rd.wos.parse("/Path/to/data.txt")

    Notes
    -----
    Unknown keys: RI, OI, Z9

    """
    paper_start_key = 'PT'
    paper_end_key = 'ER'

    # Fields whose continuation lines are separate list elements; their lines
    # are joined with a newline so they can be split again below. C1 (author
    # addresses/institutions) may span several lines or just one.
    newline_keys = ('AU', 'AF', 'CR', 'C1', 'CA')

    # Try to read filepath.
    try:
        with open(filepath, 'r') as f:
            line_list = f.read().splitlines()
    except IOError:     # File does not exist, or couldn't be read.
        raise IOError("File does not exist, or cannot be read.")

    if not line_list:
        raise IOError("Unable to read filepath or filepath is empty.")

    # Convert the data in the file to a usable list of dictionaries.
    # Note: first two lines of file are not related to any paper therein.
    wos_list = []
    last_field_tag = paper_start_key    # Initialize to something.
    for line in line_list[2:]:
        line = strip_non_ascii(line)
        field_tag = line[:2]

        if field_tag == paper_start_key:
            # Then prepare for next paper.
            wos_dict = _new_wos_dict()

        if field_tag == paper_end_key:
            # Then add paper to our list. The list holds this same dict
            # object, so the 'ER' value (and any line before the next 'PT')
            # is still written into the record that was just appended.
            wos_list.append(wos_dict)

        # Handle keys like AU, AF, CR that continue over many lines.
        if field_tag == '  ':
            field_tag = last_field_tag

        # Add value for the key to the wos_dict: the rest of the line.
        # NOTE: ``wos_dict`` is deliberately left unbound until the first
        # 'PT' line; reaching this point earlier raises UnboundLocalError,
        # which is the documented signal for "not WoS data".
        value = str(line[3:])
        previous = wos_dict.get(field_tag)
        if previous is None:
            # First value for this tag (key absent or still None).
            wos_dict[field_tag] = value
        elif field_tag in newline_keys:
            wos_dict[field_tag] = previous + '\n' + value
        else:
            wos_dict[field_tag] = previous + ' ' + value

        last_field_tag = field_tag
    # End line loop.

    # Keys that should be lists instead of the default string, with the
    # delimiter that separates their elements.
    delims = {'AU': '\n',
              'AF': '\n',
              'DE': ';',
              'ID': ';',
              'C1': '\n',
              'CR': '\n',
              'CA': '\n'}

    # Keys whose value should be an int instead of a string.
    int_keys = ('PY',)

    for wos_dict in wos_list:
        for key, delim in delims.items():
            try:
                key_contents = wos_dict[key]
                if delim == '\n':
                    wos_dict[key] = key_contents.splitlines()
                else:
                    wos_dict[key] = key_contents.split(delim)
            except (KeyError, AttributeError):
                # KeyError: the field never occurred in this record.
                # AttributeError: the value is None, or is already a list
                # because the same record was appended more than once.
                pass

        for key in int_keys:
            try:
                wos_dict[key] = int(wos_dict[key])
            except (KeyError, TypeError):
                # The field is absent, or still None; leave it alone.
                pass

    return wos_list

def _parse_cr(ref):
    """
    Supports the Web of Science reader by converting the strings found
    at the CR field tag of a record into a minimum :class:`.Paper` instance.

    The reference is split on commas and the tokens are read positionally
    as: author name, date, jtitle, volume, spage, doi. Parsing stops at the
    first token that is missing or malformed; fields after that point are
    not assigned.

    Parameters
    ----------
    ref : str
        CR field tag data from a plain text Web of Science file.

    Returns
    -------
    paper : :class:`.Paper`
        A :class:`.Paper` instance with 'ayjid' set from whatever author,
        date and journal values could be parsed.

    Notes
    -----
    IndexError (fewer tokens than expected) and ValueError (e.g. a date
    token without digits) are handled internally and are not raised to the
    caller.

    Needs a sophisticated name parser, would like to use an open source
    resource for this.

    If WoS is missing a field in the middle of the list there are NOT commas
    indicating that; the following example does NOT occur:

        Doe J, ,, Some Journal

    instead

        Doe J, Some Journal

    This threatens the integrity of WoS data; should we address it?

    Another threat: if WoS is unsure of the DOI number there will be multiple
    DOI numbers in a list of form [doi1, doi2, ...], address this?

    """
    paper = Paper()
    # Tokens of form: aulast auinit, date, jtitle, volume, spage, doi
    tokens = ref.split(',')
    try:
        # FIXME: needs better name parser.
        name = tokens[0]

        # Temp solution for #62809724: a name wrapped in square brackets is
        # reduced to the text between the first pair of brackets.
        match = re.search(r'\[(.*?)\]', name)
        if match:
            name = match.group(1)
            if DEBUG:
                print('stripped name:  ' + name)

        name_tokens = name.split(' ')
        if len(name_tokens) < 2:
            # No initials present; pad so that auinit becomes a single space.
            name_tokens.append(' ')

        paper['aulast'] = [name_tokens[0]]
        paper['auinit'] = [''.join(name_tokens[1:]).replace('.', '')]

        if DEBUG:
            print('Final Meta Dicts %s %s' % (paper['aulast'],
                                              paper['auinit']))

        # A former check here compared these one-element lists against the
        # string 'None' (never equal) and raised a plain string; it could
        # not execute and has been removed.

        # Strip initial characters based on the field (spaces, 'V', 'DOI').
        paper['date'] = int(tokens[1][1:])
        paper['jtitle'] = tokens[2][1:]
        paper['volume'] = tokens[3][2:]
        paper['spage'] = tokens[4][2:]
        paper['doi'] = tokens[5][5:]
    except IndexError:
        # ref did not have the full set of tokens.
        pass
    except ValueError:
        # This occurs when the program expects a date but gets a string with
        # no numbers. We leave the field incomplete because chances are the
        # CR string is too sparse to use anyway.
        pass

    ayjid = _create_ayjid(paper['aulast'], paper['auinit'],
                          paper['date'], paper['jtitle'])
    paper['ayjid'] = ayjid

    return paper

def convert(wos_data):
    """
    Convert parsed field-tagged data to :class:`.Paper` instances.

    Convert a dictionary or list of dictionaries with keys from the
    Web of Science field tags into a list of :class:`.Paper` instances, the
    standard for Tethne.

    Every :class:`.Paper` produced by one call is tagged with the same
    accession id (a fresh UUID4 string).

    Parameters
    ----------
    wos_data : list or dict
        A list of dictionaries with keys from the WoS field tags. A single
        dictionary is treated as a one-item list. The dictionaries are not
        modified.

    Returns
    -------
    papers : list
        A list of :class:`.Paper` instances.

    Raises
    ------
    KeyError
        A dictionary lacks one of the field tags read here (the keys of the
        translator map, or 'AU', 'C1', 'CR').

    Examples
    --------

    .. code-block:: python

       >>> import tethne.readers as rd
       >>> wos_list = rd.wos.parse("/Path/to/data.txt")
       >>> papers = rd.wos.convert(wos_list)

    Notes
    -----
    Need to handle author name anomolies (case, blank spaces, etc.) that may
    make the same author appear to be two different authors in Networkx; this
    is important for any graph with authors as nodes.

    """
    accession = str(uuid.uuid4())

    # Handle dict inputs by converting to a 1-item list.
    if isinstance(wos_data, dict):
        wos_data = [wos_data]

    # Validate before building the papers list [62809724]. The returned
    # status is currently not acted upon.
    _validate(wos_data)

    # Define the direct relationships between WoS fieldtags and Paper keys.
    translator = _wos2paper_map()

    # Create a Paper for each wos_dict and append to this list.
    papers = []
    for wos_dict in wos_data:
        # Work on a shallow copy so that merging group authors below does
        # not alter the caller's record; otherwise converting the same data
        # twice would append the 'CA' names to 'AU'/'AF' a second time.
        wos_dict = dict(wos_dict)
        paper = Paper()

        # Direct translations.
        for wos_key, paper_key in translator.items():
            paper[paper_key] = wos_dict[wos_key]

        # Group authors ('CA') are treated as personal authors.
        if 'CA' in wos_dict:
            for tag in ('AU', 'AF'):
                # Copy the lists too, since ``+=`` extends them in place.
                if isinstance(wos_dict.get(tag), list):
                    wos_dict[tag] = list(wos_dict[tag])
            try:
                wos_dict['AU'] += wos_dict['CA']
            except TypeError:           # 'AU' is None.
                wos_dict['AU'] = wos_dict['CA']
            try:
                wos_dict['AF'] += wos_dict['CA']
            except (KeyError, TypeError):   # 'AF' is absent or None.
                wos_dict['AF'] = wos_dict['CA']

        # More complicated translations.
        # FIXME: not robust to all names, organization authors, etc.
        if wos_dict['AU'] is not None:
            paper['aulast'], paper['auinit'] = _handle_authors(wos_dict)

        # Construct ayjid.
        paper['ayjid'] = _create_ayjid(paper['aulast'], paper['auinit'],
                                       paper['date'], paper['jtitle'])

        # Parse author-institution affiliations. #60216226, #57746858.
        if wos_dict['C1'] is not None:
            paper['institutions'] = _handle_author_institutions(wos_dict)

        # Convert CR references into paper format.
        if wos_dict['CR'] is not None:
            paper['citations'] = [_parse_cr(ref) for ref in wos_dict['CR']]

        paper['accession'] = accession

        papers.append(paper)
    # End wos_dict for loop.

    return papers
    
def _handle_authors(wos_dict):

    aulast_list = []
    auinit_list = []
    for name in wos_dict['AU']:
        name_tokens = name.split(',')
        aulast = name_tokens[0].upper().strip()
        try:
            # 1 for 'aulast, aufirst'
            auinit = name_tokens[1][1:].upper().strip()
        except IndexError:
            # then no first initial character
            # preserve parallel name lists with empty string
            auinit = ''
        aulast_list.append(aulast)
        auinit_list.append(auinit)
    
    return aulast_list, auinit_list

def _handle_author_institutions(wos_dict):
    pattern = re.compile(r'\[(.*?)\]')
    author_institutions = {}

    for c1_str in wos_dict['C1']:   # One C1 line for each institution.
    
        match = pattern.search(c1_str)
        if match:   # Explicit author-institution mappings are provided.
            # For example:
            #
            # [Lin, Bing-Sian; Lee, Chon-Lin] Natl Sun Yat Sen Univ, Dept 
            #   Marine Environm & Engn, Kaohsiung 80424, Taiwan.
            # [Brimblecombe, Peter] Univ E Anglia, Sch Environm Sci, Norwich NR4
            #   7TJ, Norfolk, England.
            # [Lee, Chon-Lin] Natl Sun Yat Sen Univ, Asia Pacific Ocean Res Ctr,
            #   Kuroshio Res Grp, Kaohsiung 80424, Taiwan.
            # [Lee, Chon-Lin] Natl Sun Yat Sen Univ, Ctr Emerging Contaminants 
            #   Res, Kaohsiung 80424, Taiwan.
            # [Liu, James T.] Natl Sun Yat Sen Univ, Inst Marine Geol & Chem, 
            #   Kaohsiung 80424, Taiwan.

            authors = c1_str[match.start()+1:match.end()-1].split('; ')
            institution = c1_str[match.end():].upper()      \
                                              .strip()      \
                                              .strip('.')   \
                                              .split(', ')
            
            for author in authors:
                # The A-I mapping (in data) uses the AF representation
                #  of author names. But we use the AU representation
                #  as our mapping key to ensure consistency with older
                #  datasets.
                author_index = wos_dict['AF'].index(author)
                author_au = wos_dict['AU'][author_index].upper()    \
                                                        .replace(',','')
                inst_name = ', '.join([institution[0], institution[-1].strip()])

                # Use lists, so we can tally 'votes' for most likely
                #  institution.
                try:
                    author_institutions[author_au].append(inst_name)
                except KeyError:
                    author_institutions[author_au] = [inst_name]

        else:   # Author-institution mappings are not provided. We
                #  therefore map all authors to all institutions.
            # For example:
            #
            # UN, Environm Programme, Nairobi, Kenya.
            # Univ Haifa, Dept Geog, IL-31095 Haifa, Israel.
            
            for author_au in wos_dict['AU']:
                author_au = author_au.upper()       \
                                     .replace(',','')
            
                institution = c1_str.upper()        \
                                    .strip()        \
                                    .strip('.')     \
                                    .split(',')
                inst_name = ', '.join([institution[0], institution[-1].strip()])

                # Use lists, so we can tally 'votes' for most likely
                #  institution.
                try:
                    author_institutions[author_au].append(inst_name)
                except KeyError:
                    author_institutions[author_au] = [inst_name]

    # Convert values back to lists before returning.
    return { k:list(v) for k,v in author_institutions.iteritems() }

def read(datapath):
    """
    Return a list of :class:`.Paper` instances from a Web of Science data
    file.

    This is :func:`.parse` followed by :func:`.convert`.

    Parameters
    ----------
    datapath : string
        Filepath to the Web of Science field-tagged data file.

    Returns
    -------
    papers : list
        A list of :class:`.Paper` instances.

    Raises
    ------
    IOError
        The file does not exist, cannot be read, or is empty. The message
        is the one raised by :func:`.parse`, so the actual cause is kept
        rather than being replaced by a generic "Invalid path.".

    Examples
    --------

    .. code-block:: python

       >>> import tethne.readers as rd
       >>> papers = rd.wos.read("/Path/to/data.txt")

    """
    return convert(parse(datapath))

# [#60462784]
def from_dir(path):
    """
    Convenience function for generating a list of :class:`.Paper` from a
    directory of Web of Science field-tagged data files.

    Every entry of the directory whose name does not start with '.' is
    passed to :func:`.parse`; entries that cannot be parsed as WoS data are
    skipped. All parsed records are then converted in a single
    :func:`.convert` call.

    Parameters
    ----------
    path : string
        Path to directory of field-tagged data files.

    Returns
    -------
    papers : list
        A list of :class:`.Paper` objects.

    Raises
    ------
    IOError
        Invalid path.

    Examples
    --------

    .. code-block:: python

       >>> import tethne.readers as rd
       >>> papers = rd.wos.from_dir("/Path/to/datadir")

    """
    try:
        files = os.listdir(path)
    except (IOError, OSError):
        # os.listdir reports failures as OSError, which is a separate class
        # from IOError on Python 2; catch both so that the documented
        # IOError is what callers actually receive.
        raise IOError("Invalid path.")

    wos_list = []
    for f in files:
        if f.startswith('.'):   # Ignore hidden files.
            continue
        try:
            wos_list += parse(os.path.join(path, f))
        except (IOError, UnboundLocalError):
            # Ignore files that don't contain WoS data (and unreadable
            # entries such as subdirectories).
            pass

    return convert(wos_list)

def read_corpus(path):
    """
    Read a Web of Science data file into a :class:`.Corpus`.

    Parameters
    ----------
    path : string
        Filepath to the Web of Science field-tagged data file; passed to
        :func:`.read`.

    Returns
    -------
    corpus : :class:`.Corpus`
        A :class:`.Corpus` built from the papers in the file, created with
        ``index_by='wosid'``.

    """
    papers = read(path)
    return Corpus(papers, index_by='wosid')

def corpus_from_dir(path):
    """
    Read a directory of Web of Science data files into a :class:`.Corpus`.

    Parameters
    ----------
    path : string
        Path to directory of field-tagged data files; passed to
        :func:`.from_dir`.

    Returns
    -------
    corpus : :class:`.Corpus`
        A :class:`.Corpus` built from the papers found in the directory,
        created with ``index_by='wosid'``.

    """
    papers = from_dir(path)
    return Corpus(papers, index_by='wosid')

# [62809724]
def _validate(wos_data):
    """
    Check parsed WoS records before they are converted to papers.

    This is currently a placeholder: it visits the 'CR' and 'C1' entries of
    every record but applies no checks to them yet, and therefore always
    reports success.

    Parameters
    ----------
    wos_data : list
        A list of dictionaries with keys from the WoS field tags.

    Returns
    -------
    status : bool
        True if the data is in the expected format. No check can currently
        produce False.

    Raises
    ------
    KeyError
        A record has no 'CR' or 'C1' key.
    TypeError
        A record's 'CR' or 'C1' value is neither None nor iterable.

    """
    if DEBUG:
        print(wos_data)

    for wos_dict in wos_data:
        if DEBUG:
            # Dump the value of every field tag that conversion expects.
            for key in _new_wos_dict():
                print(wos_dict[key])

        # Fields whose individual entries need validation. Any new field
        # which needs validation in the future is added here.
        for tag in ('CR', 'C1'):
            entries = wos_dict[tag]
            if entries is not None:
                for entry in entries:
                    # TODO: check that the entry is populated correctly.
                    pass

    return True

def _new_query_dict():
    """
    Declares only those keys of the :class:`.Paper`'s metadata that are
    queryable through CrossRef.
    """
    q_dict = {
                'aulast':None,
                'auinit':None,
                'atitle':None,
                'address':None,
                'jtitle':None,
                'volume':None,
                'issue':None,
                'spage':None,
                'epage':None,
                'date':None     }

    return q_dict


def _new_wos_dict():
    """
    Defines the set of field tags that will try to be converted, and intializes
    them to 'None'.

    Returns
    -------
    wos_dict : dict
        A wos_list dictionary with 'None' as default values for all keys.

    """
    wos_dict = {
                    'DI':None,
                    'AU':None,
                    'C1':None,
                    'TI':None,
                    'SO':None,
                    'VL':None,
                    'IS':None,
                    'BP':None,
                    'EP':None,
                    'PY':None,
                    'UT':None,
                    'CR':None,
                    'AB':None   }

    return wos_dict

def _wos2paper_map():
    """
    Defines the direct relationships between the wos_dict and :class:`.Paper`.

    Returns
    -------
    translator : dict
        A 'translator' dictionary.

    """
    translator = {
                    'DI':'doi',
                    'TI':'atitle',
                    'SO':'jtitle',
                    'VL':'volume',
                    'IS':'issue',
                    'BP':'spage',
                    'EP':'epage',
                    'PY':'date',
                    'UT':'wosid',
                    'AB':'abstract'    }

    return translator

#Custom Error Defined
class DataError(Exception):
    """
    Custom exception type declared by the Web of Science reader.

    NOTE(review): presumably intended for input data that is not in the
    expected format; nothing in the code shown here raises it -- confirm
    the intended use.
    """