# Source code for tethne.networks.authors

"""
Methods for generating networks in which authors are vertices.

.. autosummary::
   :nosignatures:

   author_cocitation
   author_coinstitution
   author_institution
   author_papers
   coauthors
   institutions

"""

import networkx as nx
import tethne.utilities as util
from collections import defaultdict, Counter
from ..classes import Paper


import logging
# Configure logging at import time with a verbose record format (timestamp,
# logger name, level, module, function, line number, message). No log file is
# named (filename=None).
# NOTE(review): basicConfig acts on the root logger, so importing this module
# can affect the host application's logging setup -- confirm this is intended.
logging.basicConfig(filename=None, format='%(asctime)-6s: %(name)s - %(levelname)s - %(module)s - %(funcName)s - %(lineno)d - %(message)s')
# Module-level logger used by the network builders in this file.
logger = logging.getLogger(__name__)
# Let DEBUG-level records from this module through.
logger.setLevel('DEBUG')

def author_papers(papers, node_id='ayjid', paper_attribs=[], **kwargs):
    """
    Generate a directed author-paper network.

    ==============     =========================================================
    Element            Description
    ==============     =========================================================
    Node               Two kinds of nodes with distinguishing "type" attributes:
                       * type = paper    - a paper in papers
                       * type = person   - a person in papers
                       Paper node attributes are defined by paper_attribs.
    Edge               Directed, Author -> his/her Paper. Each edge carries a
                       'date' attribute taken from the paper.
    ==============     =========================================================

    Parameters
    ----------
    papers : list
        A list of :class:`.Paper` instances (accessed like mappings).
    node_id : string
        A key from :class:`.Paper` used to identify the paper nodes. Any key
        of an empty :class:`.Paper` except 'citations' is accepted.
    paper_attribs : list
        Keys from :class:`.Paper` to attach to each paper node as attributes
        (selected with ``util.subdict``). The list is only read, never
        modified.

    Returns
    -------
    author_papers_graph : networkx.DiGraph
        A DiGraph 'author_papers_graph'.

    Raises
    ------
    KeyError : Raised when node_id is not a usable key of :class:`.Paper`.

    """
    author_papers_graph = nx.DiGraph(type='author_papers')

    # Validate node_id against the fields of an empty Paper; 'citations' is
    # explicitly excluded as an identifier.
    meta_keys = list(Paper().keys())
    meta_keys.remove('citations')
    if node_id not in meta_keys:
        raise KeyError('node_id ' + node_id + ' cannot be used to identify' +
                       ' papers.')

    for entry in papers:
        # Paper node, carrying the requested attributes plus its type.
        paper_attrib_dict = util.subdict(entry, paper_attribs)
        paper_attrib_dict['type'] = 'paper'
        author_papers_graph.add_node(entry[node_id], paper_attrib_dict)

        # A paper without author data keeps its node but gets no edges. This
        # mirrors the ``aulast is not None`` guard in coauthors(); presumably
        # util.concat_list cannot handle None -- TODO confirm.
        if entry['aulast'] is None:
            continue

        for author in util.concat_list(entry['aulast'], entry['auinit'], ' '):
            # Person node, then the Author -> Paper edge.
            author_papers_graph.add_node(author, type="person")
            author_papers_graph.add_edge(author, entry[node_id],
                                         date=entry['date'])

    return author_papers_graph

def institutions(papers, threshold=1, edge_attrbs=['ayjid'],
                 node_attribs=['authors'], geocode=False, **kwargs):
    """
    Generates an institutional network based on coauthorship.

    An edge is drawn between two institutional vertices whenever two authors,
    one at each respective institution, coauthor a :class:`.Paper`. The
    network is derived from the graph returned by :func:`coauthors`; coauthor
    edges for which either author has no 'institution' attribute are ignored.

    .. code-block:: python

       >>> I = nt.authors.institutions(papers)
       >>> I
       <networkx.classes.graph.Graph object at 0x10d94cfd0>


    ==============     =========================================================
    Element            Description
    ==============     =========================================================
    Node               Institution name and location. Attributes: 'authors'
                       (list of author names), 'size' (number of authors) and,
                       when geocoding data is available, 'latitude',
                       'longitude' and 'precision'.
    Edges              (a,b) in E(G) if coauthors R and S are affiliated with
                       institutions a and b, respectively. Attributes:
                       'authors' (list of author pairs), 'ayjid' (list) and
                       'weight' (summed coauthor edge weights).
    ==============     =========================================================

    Parameters
    ----------
    papers : list
        A list of :class:`Paper` instances.
    threshold : int
        Forwarded to :func:`coauthors`: minimum number of coauthored papers
        required for an author pair to be considered. (default: 1)
    edge_attrbs : list
        Accepted for backward compatibility; not used by the current
        implementation. (default: ['ayjid'])
    node_attribs : list
        Accepted for backward compatibility; not used by the current
        implementation. (default: ['authors'])
    geocode : bool
        If True, :func:`coauthors` attempts to geocode institutional
        information for authors, and latitude, longitude, and precision
        attributes are copied to each institution node where available.
    kwargs : dict
        Forwarded unchanged to :func:`coauthors`.

    Returns
    -------
    G : networkx.Graph
        An institutional co-authorship network.
    """

    G = nx.Graph(type='institutions')
    ca = coauthors(papers, threshold=threshold, geocode=geocode, **kwargs)

    geo_keys = ('latitude', 'longitude', 'precision')
    edges = {}
    nodes = {'latitude': {},
             'longitude': {},
             'precision': {},
             'authors': {}}

    for author_a, author_b, data in ca.edges(data=True):
        pair = (author_a, author_b)
        attrs = (ca.node[author_a], ca.node[author_b])

        # If there is no institutional information for an author, skip the
        # edge.
        try:
            inst = (attrs[0]['institution'], attrs[1]['institution'])
        except KeyError:
            continue

        if geocode:
            # Add geodata from the most recent author at this institution.
            # coauthors() only sets these attributes when geocoding succeeded,
            # so authors without them must not raise here.
            for i in (0, 1):
                if all(k in attrs[i] for k in geo_keys):
                    for k in geo_keys:
                        nodes[k][inst[i]] = attrs[i][k]

        # The graph is undirected: reuse an existing entry in either
        # orientation before creating a new one.
        if inst in edges:
            key = inst
        elif inst[::-1] in edges:
            key = inst[::-1]
        else:
            key = inst
            # Fresh containers per edge, to avoid shared references.
            edges[key] = {'authors': [], 'ayjid': [], 'weight': 0}

        # Add authors to institution nodes.
        for i in (0, 1):
            nodes['authors'].setdefault(inst[i], set()).add(pair[i])

        edges[key]['authors'].append(pair)
        edges[key]['ayjid'] += data['ayjid']
        edges[key]['weight'] += data['weight']

    for (inst_a, inst_b), attributes in edges.iteritems():
        G.add_edge(inst_a, inst_b, **attributes)

    for key, values in nodes.iteritems():
        if key == 'authors':    # Since many writers don't support sets.
            values = {k: list(v) for k, v in values.iteritems()}
        nx.set_node_attributes(G, key, values)
    nx.set_node_attributes(G, 'size',
                           {k: len(v) for k, v
                            in nodes['authors'].iteritems()})

    return G

def coauthors(papers, threshold=1, edge_attribs=['ayjid'],
              node_attribs=['institution'], geocode=False,  **kwargs):
    """
    Generate a co-author network.

    As the name suggests, edges are drawn between two author-vertices in the
    case that those authors published a paper together. Co-authorship networks
    are popular models for studying patterns of collaboration in scientific
    communities.

    To generate a co-authorship network, use the
    :func:`.networks.authors.coauthors` method:

    Author institutional affiliation is included as a node attribute, if
    possible.

    .. code-block:: python

       >>> CA = nt.authors.coauthors(papers)
       >>> CA
       <networkx.classes.graph.Graph object at 0x10d94cfd0>

    ==============     =========================================================
    Element            Description
    ==============     =========================================================
    Node               Author name. Optional attributes: 'institution' and,
                       with geocode, 'latitude', 'longitude', 'precision'.
    Edges              (a,b) in E(G) if a and b are coauthors on the same paper.
                       Attributes: 'weight' (number of coauthored papers) and
                       one list per key in edge_attribs.
    ==============     =========================================================

    Parameters
    ----------
    papers : list
        A list of :class:`Paper` instances.
    threshold : int
        Minimum number of coauthored papers required for an edge.
        (default: 1)
    edge_attribs : list
        List of edge_attributes specifying which :class:`.Paper` keys (from the
        co-authored paper) to use as edge attributes. (default: ['ayjid'])
    node_attribs : list
        List of attributes to attach to author nodes. Presently limited to
        'institution'.
    geocode : bool
        If True, attempts to geocode institutional information for authors, and
        adds latitude, longitude, and precision attributes to each node.

    Returns
    -------
    G : networkx.Graph
        A co-authorship network.

    """

    # TODO: Check whether papers contains :class:`.Paper` instances, and raise
    #  an exception if not.

    caller = logger.findCaller()
    logger.debug("{0}: start building coauthors graph".format(caller[1]))

    G = nx.Graph(type='coauthors')
    coauthor_dict = {}      # (author_a, author_b) -> edge attribute dict.
    author_inst = {}        # author -> list of every institution seen.

    for entry in papers:
        if entry['aulast'] is None:
            continue

        # edge_att holds the user-requested attributes for any edges that get
        #  added from this paper.
        edge_att = util.subdict(entry, edge_attribs)
        full_names = util.concat_list(entry['aulast'],
                                      entry['auinit'],
                                      ' ')
        affiliations = entry['institutions']

        for a in xrange(len(full_names)):
            n = full_names[a]

            # Update global author-institution mappings. Extend a list owned
            #  by this function so the paper's own data is never mutated.
            if affiliations is not None and n in affiliations:
                author_inst.setdefault(n, []).extend(affiliations[n])

            for b in xrange(a + 1, len(full_names)):
                # The graph is undirected: reuse an existing entry in either
                #  orientation before creating a new one.
                authors = (n, full_names[b])
                authors_inv = (full_names[b], n)

                if authors in coauthor_dict:
                    key = authors
                elif authors_inv in coauthor_dict:
                    key = authors_inv
                else:
                    key = authors
                    coauthor_dict[key] = {k: [] for k in edge_att}
                    coauthor_dict[key]['weight'] = 0

                for k, v in edge_att.iteritems():
                    coauthor_dict[key][k].append(v)
                coauthor_dict[key]['weight'] += 1

    caller = logger.findCaller()
    logger.debug("{0}: done iterating over papers".format(caller[1]))

    # Add edges with specified edge attributes.
    for key, val in coauthor_dict.iteritems():
        if val['weight'] >= threshold:
            G.add_edge(key[0], key[1], attr_dict=val)

    caller = logger.findCaller()
    logger.debug("{0}: done adding edges".format(caller[1]))

    # Load GeoCoder here, to avoid excessive cache read/write operations.
    if geocode:
        from tethne.services.geocode import GoogleCoder
        gc = GoogleCoder()
        caller = logger.findCaller()
        logger.debug("{0}: loaded geocoder".format(caller[1]))

    if 'institution' in node_attribs:
        # Include institutional affiliations as node attributes, if possible.

        # Find most likely institution for each author. This won't work well if
        #  the author only occurs once in the dataset and there was no explicit
        #  author-institution mapping.

        caller = logger.findCaller()
        logger.debug("{0}: adding institutional information".format(caller[1]))

        for k, v in author_inst.iteritems():
            # An author with no coauthors (or whose edges fell below the
            #  threshold) does not appear in G; one with no recorded
            #  institutions has nothing to attach.
            if not v or k not in G:
                continue

            # Most frequently recorded institution for this author.
            top_inst = Counter(v).most_common(1)[0][0]
            G.node[k]['institution'] = top_inst

            # Optionally, include positional information, if possible.
            if geocode:
                location = gc.code_this(top_inst)

                if location is None:
                    # Fall back to the last comma-separated component of the
                    #  institution string.
                    location = gc.code_this(top_inst.split(',')[-1])
                    precision = 'country'
                else:
                    precision = 'institution'

                if location is not None:
                    G.node[k]['latitude'] = location.latitude
                    G.node[k]['longitude'] = location.longitude
                    G.node[k]['precision'] = precision

    caller = logger.findCaller()
    logger.debug("{0}: done building coauthors graph".format(caller[1]))

    return G

def author_institution(Papers, edge_attribs=[], **kwargs):
    """
    Generate a bi-partite graph connecting authors and their institutions.

    This may be slightly ambiguous for WoS data where there is no explicit
    author-institution mapping. Edge weights are the number of co-associations
    between an author and an institution within a paper, which should help
    resolve this ambiguity (the more data the better).

    ==============     =========================================================
    Element            Description
    ==============     =========================================================
    Node               Author name (type = 'author') or institution name
                       (type = 'institution').
    Edge               (a,i) in E(G) if author a is listed with institution i
                       on a paper; one edge is added per paper. 'weight' is the
                       number of times i appears in a's institution list for
                       that paper.
    ==============     =========================================================

    Parameters
    ----------
    Papers : list
        A list of :class:`.Paper` instances.
    edge_attribs : list
        List of edge_attributes specifying which :class:`.Paper` keys (from the
        authored paper) to use as edge attributes. For example, the 'date' key
        in :class:`.Paper` .

    Returns
    -------
    author_institution_graph : networkx.MultiGraph
        A graph describing institutional affiliations of authors in the corpus.
    """

    author_institution_graph = nx.MultiGraph(type='author_institution')

    # The 'institutions' field of a Paper maps each author to a list of
    # institutions: { author: [institution, ...] }.
    for paper in Papers:
        affiliations = paper['institutions']
        if affiliations is None:
            continue

        edge_attrib_dict = util.subdict(paper, edge_attribs)
        for au in affiliations:
            author_institution_graph.add_node(au, type='author')

            # Count repeated mentions of the same institution for this author.
            tally = Counter(affiliations[au])
            for ins_str, count in tally.iteritems():
                author_institution_graph.add_node(ins_str, type='institution')
                author_institution_graph.add_edge(au, ins_str,
                                                  attr_dict=edge_attrib_dict,
                                                  weight=count)

    return author_institution_graph

def author_coinstitution(Papers, threshold=1, **kwargs):
    """
    Generate a co-institution graph, where edges indicate shared affiliation.

    Some bibliographic datasets, including data from the Web of Science,
    includes the institutional affiliations of authors. In a co-institution
    graph, two authors (vertices) have an edge between them if they share an
    institutional affiliation in the dataset. Note that data about institutional
    affiliations varies in the WoS database so this will yield more reliable
    results for more recent publications.

    To generate a co-institution network, use the
    :func:`.networks.authors.author_coinstitution` method:

    .. code-block:: python

       >>> ACI = nt.authors.author_coinstitution(papers)
       >>> ACI
       <networkx.classes.graph.Graph object at 0x106571190>

    ==============     =========================================================
    Element            Description
    ==============     =========================================================
    Node               Authors. Only authors with at least one edge appear.
    Node Attribute     type (string). Always 'author'.
    Edges              (a, b) where a and b are affiliated with the same
                       institution.
    Edge attribute     overlap (int). number of shared institutions.
    ==============     =========================================================

    Parameters
    ----------
    Papers : list
        A list of :class:`.Paper` instances. Their 'institutions' field is
        read but not modified.
    threshold : int
        Minimum institutional overlap required for an edge.

    Returns
    -------
    coinstitution : NetworkX :class:`.graph`
        A coinstitution network.

    """
    from itertools import combinations

    coinstitution = nx.Graph(type='author_coinstitution')

    # The 'institutions' field of a Paper maps each author to a list of
    #  institutions: { author: [institution, ...] }. Collect, per author, the
    #  set of all institutions seen across the corpus. Sets owned by this
    #  function are used so the papers' own lists are never extended in place.
    author_institutions = {}
    for paper in Papers:
        if paper['institutions'] is None:
            continue
        for author, insts in paper['institutions'].iteritems():
            author_institutions.setdefault(author, set()).update(insts)

    # Compare every unordered pair of authors once, after all papers have been
    #  read; the graph is undirected, so one visit per pair suffices.
    for (a, a_insts), (b, b_insts) in combinations(
                                        author_institutions.items(), 2):
        overlap = len(a_insts & b_insts)
        if overlap >= threshold:
            coinstitution.add_edge(a, b, overlap=overlap)

    # Every node in this graph is an author.
    attribs_dict = dict.fromkeys(coinstitution.nodes(), 'author')
    nx.set_node_attributes(coinstitution, 'type', attribs_dict)

    return coinstitution

def author_cocitation(papers, threshold=1, **kwargs):
    """
    Generates an author co-citation network; edges indicate co-citation of
    authors' papers.

    Similar to :func:`.papers.cocitation`, except that vertices are authors
    rather than papers. To generate an author co-citation network, use the
    :func:`.networks.authors.author_cocitation` method:

    .. code-block:: python

       >>> ACC = nt.authors.author_cocitation(papers)
       >>> ACC
       <networkx.classes.graph.Graph object at 0x106571190>

    ==============     =========================================================
    Element            Description
    ==============     =========================================================
    Nodes              Author name (upper-cased "LAST INIT").
    Edge               (a, b) if a and b are referenced by the same paper in
                       papers
    Edge attribute     'weight', the number of papers that co-cite a and b.
    ==============     =========================================================

    Parameters
    ----------
    papers : list
        a list of :class:`.Paper` objects.
    threshold : int
        Minimum number of co-citations required to create an edge between
        authors.

    Returns
    -------
    cocitation : :class:`.networkx.Graph`
        A cocitation network.

    """
    from itertools import combinations

    author_cocitations = nx.Graph(type='author_cocitation')

    # Keys are (author, author) tuples in sorted order, so each unordered
    # pair has exactly one key. Values are the number of papers that co-cite
    # the pair.
    cocitations = Counter()
    delim = ' '

    for paper in papers:
        citations = paper['citations']
        # Some papers don't have citations, and a single citation cannot
        # produce a pair.
        if citations is None or len(citations) < 2:
            continue

        # Build each cited author's name once. A set ensures that an author
        # cited several times by the same paper is counted once per pair, and
        # that no author is paired with him/herself.
        cited = set()
        for citation in citations:
            last = ''.join(map(str, citation['aulast']))
            init = ''.join(map(str, citation['auinit']))
            cited.add((last + delim + init).upper())

        for authors_pair in combinations(sorted(cited), 2):
            cocitations[authors_pair] += 1

    # Create the network, keeping only pairs that meet the threshold.
    for (author_a, author_b), weight in cocitations.iteritems():
        if weight >= threshold:
            author_cocitations.add_edge(author_a, author_b, weight=weight)

    return author_cocitations