
Source code for tethne.readers.scopus

"""
Reader for Scopus CSV data files.
"""

import csv
import re
import os
import uuid

from unidecode import unidecode

import sys
PYTHON_3 = sys.version_info[0] == 3
if PYTHON_3:
    unicode = str
    xrange = range

from tethne import Paper, Corpus

def read_corpus(path):
    """
    Read a Scopus CSV data file and return a :class:`.Corpus`.
    """
    papers = read(path)
    return Corpus(papers, index_by='eid')
def corpus_from_dir(path):
    """
    Parameters
    ----------
    path : string
        Path to directory of Scopus CSV data files.

    Returns
    -------
    corpus : :class:`.Corpus`
        A :class:`.Corpus` built from the :class:`.Paper`\s in ``path``,
        indexed by EID.
    """
    papers = from_dir(path)
    return Corpus(papers, index_by='eid')
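# --- Illustrative usage (not part of the original module) ---
# A minimal sketch of how the two convenience constructors above might be
# called, following the import style used in the docstrings below; the paths
# are hypothetical placeholders.
#
#     >>> import tethne.readers as rd
#     >>> corpus = rd.scopus.read_corpus("/Path/to/scopus.csv")     # One file.
#     >>> corpus = rd.scopus.corpus_from_dir("/Path/to/datadir")    # A directory.
#
# Both return a :class:`.Corpus` whose papers are indexed by their Scopus EID.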
def from_dir(path):
    """
    Convenience function for generating a list of :class:`.Paper` from a
    directory of Scopus CSV data files.

    Parameters
    ----------
    path : string
        Path to directory of Scopus CSV data files.

    Returns
    -------
    papers : list
        A list of :class:`.Paper` objects.

    Raises
    ------
    IOError
        Invalid path.

    Examples
    --------

    .. code-block:: python

       >>> import tethne.readers as rd
       >>> papers = rd.scopus.from_dir("/Path/to/datadir")

    """
    papers = []

    try:
        files = os.listdir(path)
    except (IOError, OSError):
        raise IOError("Invalid path.")

    for f in files:
        if not f.startswith('.') and f.endswith('csv'):  # Ignore hidden files.
            try:
                papers += read('{0}/{1}'.format(path, f))
            except (IOError, UnboundLocalError):    # Ignore files that don't
                pass                                #  contain Scopus data.

    return papers
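# A minimal sketch of the filename filter applied in from_dir() above; the
# file names are fabricated for illustration.
#
#     >>> keep = lambda f: not f.startswith('.') and f.endswith('csv')
#     >>> [f for f in ['scopus1.csv', '.hidden.csv', 'notes.txt'] if keep(f)]
#     ['scopus1.csv']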
def read(datapath, **kwargs):
    """
    Returns a list of :class:`.Paper` instances from a Scopus CSV data file.

    Parameters
    ----------
    datapath : string
        Filepath to the Scopus CSV data file.

    Returns
    -------
    papers : list
        A list of :class:`.Paper` instances.

    Examples
    --------

    .. code-block:: python

       >>> import tethne.readers as rd
       >>> papers = rd.scopus.read("/Path/to/scopus.csv")

    """
    accession = unicode(uuid.uuid4())   # Accession ID.

    papers = kwargs.get('papers', [])   # Can provide an alternate container.

    rawdata = []
    with open(datapath, 'rb') as f:
        reader = csv.reader(f)
        for row in reader:
            rawdata.append([unidecode(r.decode('UTF-8')) for r in row])

    headers = rawdata[0]    # First row is the header.
    for datum in rawdata[1:]:
        p = Paper()
        rawdatum = {headers[i]: datum[i] for i in xrange(len(headers))}

        p['aulast'], p['auinit'] = _handle_authors(rawdatum['Authors'])
        p['date'] = int(rawdatum['Year'].strip())
        p['atitle'] = rawdatum['Title'].strip().upper()
        p['jtitle'] = rawdatum['Source title'].strip().upper()
        # jtitle must be set before the fuzzy identifier is built.
        p['ayjid'] = _create_ayjid(p['aulast'], p['auinit'],
                                   p['date'], p['jtitle'])
        p['institutions'] = _handle_affiliations(
            rawdatum['Authors with affiliations'], p['aulast'], p['auinit'])
        p['volume'] = rawdatum['Volume'].strip()
        p['issue'] = rawdatum['Issue'].strip()
        p['spage'] = rawdatum['Page start'].strip()
        p['epage'] = rawdatum['Page end'].strip()
        p['abstract'] = rawdatum['Abstract'].strip().upper()
        p['citations'] = _handle_references(rawdatum['References'])
        p['accession'] = accession

        # DOI and PMID are not always present, but must be unique. If they are
        #  not provided, they should be set to None so that they can be
        #  handled appropriately downstream. EID should always be present.
        doi = rawdatum['DOI'].strip()
        if doi == '':
            doi = None
        p['doi'] = doi

        pmid = rawdatum['PubMed ID'].strip()
        if pmid == '':
            pmid = None
        p['pmid'] = pmid

        p['eid'] = rawdatum['EID'].strip()

        papers.append(p)

    return papers
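# read() assumes that the first row of the file is a Scopus export header; the
# columns accessed above are:
#
#     Authors, Authors with affiliations, Title, Year, Source title, Volume,
#     Issue, Page start, Page end, Abstract, References, DOI, PubMed ID, EID
#
# A minimal usage sketch (the path is a hypothetical placeholder):
#
#     >>> papers = read("/Path/to/scopus.csv")
#     >>> papers[0]['eid'] is not None    # EID should always be present.
#     True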
def _handle_authors(authordata):
    aulast = []
    auinit = []
    for author in authordata.split(', '):
        try:
            a = [a_.strip().upper().replace('.', '') for a_ in author.split()]
            aulast.append(' '.join(a[0:-1]))
            auinit.append(a[-1])
        except IndexError:  # Empty record; stray delimiter.
            pass

    return aulast, auinit


def _handle_affiliations(affiliationsdata, aulast, auinit):
    def matching_author(last, init, instcleaned):
        au_parts = [last, init]
        matching = 0
        for x in xrange(len(au_parts)):
            if au_parts[x] == instcleaned[x]:
                matching += 1
        if matching > 1:
            return True, matching
        return False, 0

    institutions = {}

    aff_split = affiliationsdata.split(';')
    A = len(aff_split)
    for i in xrange(A):
        aff = aff_split[i]
        a = aff.split(', ')
        try:
            cleaned = [a_.strip().upper().replace('.', '') for a_ in a]

            # First work on the assumption that affiliations are in the same
            #  order as author names.
            aul = aulast[i]
            aui = auinit[i]
            match, overlap = matching_author(aul, aui, cleaned)
            if match:
                aname = ' '.join([aul, aui])
            # That might not always be true, so we'll try the other authors
            #  just in case.
            else:
                found = False
                for x in xrange(len(aulast)):
                    aul = aulast[x]
                    aui = auinit[x]
                    match, overlap = matching_author(aul, aui, cleaned)
                    if match:
                        aname = ' '.join([aul, aui])
                        found = True
                        break
                if not found:   # If the author can't be identified, discard.
                    break

            institution = a[match+1:]   # The remainder is the institution name.
            l = len(institution)
            if l == 0:
                continue
            if l == 1:
                nation = institution[0].split()[-1].upper()
                inst = ' '.join(institution[0].split()[0:-1]).upper()
            elif l == 2:
                nation = institution[1].upper()
                inst = institution[0].upper()
            else:
                nation = institution[-1].upper()
                inst = ', '.join(institution[0:2]).upper()

            institutions[aname] = [', '.join([inst, nation])]
        except IndexError:  # Blank record part (stray delimiter).
            pass

    # inst_list should be a list of lists, ordered by aulast/auinit.
    authors = [' '.join(a) for a in zip(aulast, auinit)]    # To use as keys.
    inst_list = []
    for au in authors:
        if au in institutions:      # Data available.
            inst_list.append(institutions[au])
        else:
            inst_list.append([])    # No data for that author.

    return inst_list


def _create_ayjid(aulast=None, auinit=None, date=None, jtitle=None, **kwargs):
    """
    Convert aulast, auinit, and jtitle into the fuzzy identifier ayjid.

    Returns 'Unknown paper' if all id components are missing (None).

    Parameters
    ----------
    kwargs : dict
        A dictionary of keyword arguments.
    aulast : string
        Author surname.
    auinit : string
        Author initial(s).
    date : string
        Four-digit year.
    jtitle : string
        Title of the journal.

    Returns
    -------
    ayj : string
        Fuzzy identifier ayjid, or 'Unknown paper' if all id components are
        missing (None).
    """
    if aulast is None or len(aulast) == 0:
        aulast = ''
    elif isinstance(aulast, list):
        aulast = aulast[0]

    if auinit is None or len(auinit) == 0:
        auinit = ''
    elif isinstance(auinit, list):
        auinit = auinit[0]

    if date is None:
        date = ''

    if jtitle is None:
        jtitle = ''

    ayj = aulast + ' ' + auinit + ' ' + unicode(date) + ' ' + jtitle

    if ayj == '   ':
        ayj = 'Unknown paper'

    return ayj.upper()


def _handle_references(referencesdata):
    """
    Use a series of RegEx patterns to (roughly) parse references into
    :class:`.Paper` objects.
    """
    references = []    # A list of Papers.

    for r in referencesdata.strip().split(';'):    # Each cited reference.
        # Split at date first, e.g. ' ... (1995) ... '
        m = re.search('(.*)\((?P<date>[0-9]{4})\)(.*)', r)
        if m is not None:
            date = int(m.group('date'))
            pre_date = m.group(1).strip()   # Authors, title (unless it's a book).
            post_date = m.group(3).strip()  # Journal, volume, pages, etc.

            # Handle authors and title.
            # Looks for one or more like 'Author, J.K.L., '
            m_ = re.findall('([a-zA-Z]*,\s[A-Z\.]*,[\s]*)', pre_date)
            if m_ is not None:  # Books will pass, but we won't use title.
                # Remove commas between surnames and initials.
                a_ref = ', '.join([' '.join(a.split(', ')) for a in m_])
                aulast, auinit = _handle_authors(a_ref)

                # Remainder is title.
                l = len(''.join(m_))
                title = pre_date[l:].upper().strip()
            else:   # Let the rest go for now.
                continue

            # Handle journal, volume, issue, pages.
            m_ = re.search('(.*)[p]{1,2}\.(.*)', post_date)
            if m_ is not None:  # Books will fail here.
                pages = m_.group(2).strip()
                pre_pp = m_.group(1)

                # Get start (and end) page number(s).
                if len(pages.split('-')) == 2:  # Both start and end.
                    spage = pages.split('-')[0]
                    epage = pages.split('-')[1]
                else:   # Only one page.
                    spage = pages
                    epage = pages

                # Get journal details.
                jtitle = pre_pp.split(', ')[0].strip().upper()
                issue_volume = pre_pp.split(', ')[1].strip()

                # Look for an issue number, in parentheses.
                j = re.search('([a-z0-9]*) \(([a-z0-9]*)\)', issue_volume)
                if j is not None:
                    volume = j.group(1)
                    issue = j.group(2)
                else:   # No issue number.
                    volume = issue_volume
                    issue = ''
            else:   # Probably a book.
                # TODO: Make this better.
                title = post_date.strip()   # Includes publisher, etc.
                jtitle = ''
                issue = ''
                volume = ''
                spage = ''
                epage = ''
        else:   # Let the rest go for now.
            continue

        # Populate the Paper object.
        p = Paper()
        p['atitle'] = title
        p['aulast'] = aulast
        p['auinit'] = auinit
        p['issue'] = issue
        p['date'] = date
        p['volume'] = volume
        p['jtitle'] = jtitle
        p['spage'] = spage
        p['epage'] = epage
        p['ayjid'] = _create_ayjid(p['aulast'], p['auinit'],
                                   p['date'], p['jtitle'])

        references.append(p)

    return references
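# A rough sketch of what _handle_references() produces for a single reference
# string; the citation below is fabricated for illustration, and real Scopus
# reference strings vary in format.
#
#     >>> refs = _handle_references(
#     ...     'Smith, J.A., A study of things (1995) '
#     ...     'J. Important Res., 12 (3), pp. 45-67')
#     >>> refs[0]['ayjid']
#     'SMITH JA 1995 J. IMPORTANT RES.'
#     >>> refs[0]['atitle'], refs[0]['volume'], refs[0]['issue']
#     ('A STUDY OF THINGS', '12', '3')
#     >>> refs[0]['spage'], refs[0]['epage']
#     ('45', '67')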