Need help? Have a feature request? Please check out the tethne-users group.
Source code for tethne.utilities
"""
Helper functions.
"""
import string
import copy
import sys
PYTHON_3 = sys.version_info[0] == 3
if PYTHON_3:
unicode = str
from html.parser import HTMLParser # Python 3.x
xrange = range
else:
from HTMLParser import HTMLParser # Python 2.x
def is_number(value):
    """
    Report whether ``value`` can be interpreted as a number.

    ``int(value)`` is attempted first and ``float(value)`` second.

    Parameters
    ----------
    value : object
        Usually a string or a numeric value.

    Returns
    -------
    bool
        True if either conversion succeeds, False otherwise.
    """
    for convert in (int, float):
        try:
            convert(value)
        except (ValueError, TypeError, OverflowError):
            # ValueError: the text is not numeric.
            # TypeError: the value is neither text nor a number (e.g. None).
            # OverflowError: int(float('inf')); float() is still tried next.
            continue
        return True
    return False
def number(value):
    """
    Convert ``value`` to a number when possible.

    ``int(value)`` is attempted first and ``float(value)`` second. Because
    ``int`` comes first, a float argument is truncated to an integer.

    Parameters
    ----------
    value : object
        Usually a string or a numeric value.

    Returns
    -------
    int, float or object
        The converted number, or ``value`` itself, unchanged, when neither
        conversion succeeds.
    """
    for convert in (int, float):
        try:
            return convert(value)
        except (ValueError, TypeError, OverflowError):
            # TypeError (e.g. None) and OverflowError (int(float('inf')))
            # are treated like ValueError: fall through to the next
            # conversion, and finally to returning the input unchanged.
            continue
    return value
def tokenize(s):
    """
    Return ``s`` converted to lower case.

    NOTE: a second ``tokenize(passage)`` is defined further down in this
    module and replaces this definition once the module has been loaded.
    """
    lowered = s.lower()
    return lowered
class MLStripper(HTMLParser):
    """
    :class:`HTMLParser` subclass whose :meth:`feed` accepts non-string data.
    """

    def __init__(self):
        # Call the base initializer explicitly. ``super(type(self), self)``
        # resolves to this very method again as soon as the class is
        # subclassed, which ends in infinite recursion.
        HTMLParser.__init__(self)
        self.reset()
        self.fed = []       # Starts empty; nothing in this class appends to it.

    def feed(self, data):
        """
        Append ``data`` to the parser buffer and process it.

        Parameters
        ----------
        data : object
            Markup to parse. Values that cannot be concatenated to the
            buffer directly (e.g. integers) are converted to text first.
        """
        try:
            self.rawdata = self.rawdata + data
        except TypeError:
            # Data sometimes arrives as an integer rather than a string.
            data = unicode(data)
            self.rawdata = self.rawdata + data
        self.goahead(0)
def argmin(iterable):
    """
    Return the index of the smallest element of ``iterable``.

    Parameters
    ----------
    iterable : iterable
        Values that can be compared with each other. Generators are
        accepted; the input is materialised into a list first.

    Returns
    -------
    int
        Index of the first occurrence of the minimum. A single-element
        input, or one whose elements are all equal, yields ``0``.

    Raises
    ------
    ValueError
        If ``iterable`` is empty.
    """
    values = list(iterable)
    # min() keeps the first of several equally small candidates, so ties
    # resolve to the lowest index.
    return min(range(len(values)), key=values.__getitem__)
def argmax(iterable):
    """
    Return the index of the largest element of ``iterable``.

    Parameters
    ----------
    iterable : iterable
        Values that can be compared with each other. Generators are
        accepted; the input is materialised into a list first.

    Returns
    -------
    int
        Index of the first occurrence of the maximum. A single-element
        input, or one whose elements are all equal, yields ``0``.

    Raises
    ------
    ValueError
        If ``iterable`` is empty.
    """
    # Materialise first: the values are needed both for comparison and
    # for indexing, and a generator could only be consumed once.
    values = list(iterable)
    # max() keeps the first of several equally large candidates, so ties
    # resolve to the lowest index.
    return max(range(len(values)), key=values.__getitem__)
def mean(iterable):
    """
    Return the arithmetic mean of the values in ``iterable``.

    Parameters
    ----------
    iterable : iterable
        Numeric values. Generators are accepted; the input is materialised
        into a list so that it can be both summed and counted.

    Returns
    -------
    float
        The mean, or ``nan`` when ``iterable`` is empty.
    """
    values = list(iterable)
    if not values:
        return float('nan')
    return float(sum(values)) / len(values)
def _iterable(o):
if hasattr(o, '__iter__'):
return o
else:
return [o]
def _strip_punctuation(s):
"""
Removes all punctuation characters from a string.
"""
if type(s) is str and not PYTHON_3: # Bytestring (default in Python 2.x).
return s.translate(string.maketrans("",""), string.punctuation)
else: # Unicode string (default in Python 3.x).
translate_table = dict((ord(char), u'') for char in u'!"#%\'()*+,-./:;<=>?@[\]^_`{|}~')
return s.translate(translate_table)
def _strip_numbers(s):
    """
    Removes all numbers from a string.

    Each character is tested on its own with ``is_number``; the characters
    that fail the test are joined back together in their original order.
    """
    kept = []
    for char in s:
        if is_number(char):
            continue
        kept.append(char)
    return u''.join(kept)
def normalize(s):
    """
    Normalize a token.

    The steps are applied in this order:

    * convert to lower-case,
    * remove all punctuation,
    * remove all numbers.
    """
    lowered = s.lower()
    without_punctuation = _strip_punctuation(lowered)
    return _strip_numbers(without_punctuation)
def tokenize(passage):
    """
    Convert a string into a list of normalized words.

    The passage is split on single space characters and every piece is
    passed through ``normalize``. Consecutive spaces therefore produce
    empty pieces, which are normalized like any other.
    """
    pieces = passage.split(' ')
    return list(map(normalize, pieces))
def _space_sep(s):
if len(s) > 3:
return s
return ' '.join(list(s))
def swap(u, v):
    """
    Return deep copies of the two arguments in exchanged order.

    Returns
    -------
    tuple
        ``(copy of v, copy of u)``; the originals are left untouched.
    """
    new_first = copy.deepcopy(v)
    new_second = copy.deepcopy(u)
    return new_first, new_second
def contains(l, f):
    """
    Report whether any element of ``l`` satisfies the predicate ``f``.

    Elements are tested in order and the search stops at the first match.

    Returns
    -------
    bool
        True if ``f(x)`` is truthy for some ``x`` in ``l``, else False.
    """
    return any(f(x) for x in l)
def overlap(listA, listB):
    """
    Return the objects that occur in both ``listA`` and ``listB``.

    Returns
    -------
    list
        The shared objects, without duplicates and in no particular order.
        Empty if either argument is None.
    """
    if listA is None or listB is None:
        return []
    first, second = set(listA), set(listB)
    return list(first & second)
def subdict(super_dict, keys):
    """
    Build a new dict holding only the requested entries of ``super_dict``.

    Parameters
    ----------
    super_dict : dict
        Source mapping.
    keys : iterable
        Keys to copy. Those not present in ``super_dict`` are skipped.

    Returns
    -------
    dict
        A new dictionary; ``super_dict`` is not modified.
    """
    available = super_dict.keys()
    return {key: super_dict[key] for key in keys if key in available}
def attribs_to_string(attrib_dict, keys):
    """
    Convert container-valued attributes to strings, in place.

    A more specific version of the subdict utility aimed at handling
    node and edge attribute dictionaries for NetworkX file formats such as
    gexf (which does not allow attributes to have a list type) by making
    them writable in those formats.

    Parameters
    ----------
    attrib_dict : dict
        Attribute dictionary. Values that are lists, dicts or tuples are
        replaced by their ``str()`` representation; all other values are
        left as they are.
    keys : object
        Unused; kept so that existing callers continue to work.

    Returns
    -------
    dict
        The same ``attrib_dict`` object that was passed in.
    """
    # ``items()`` works on both Python 2 and 3 (``iteritems`` is Python 2
    # only). The snapshot keeps the loop independent of the assignments.
    for key, value in list(attrib_dict.items()):
        if isinstance(value, (list, dict, tuple)):
            attrib_dict[key] = str(value)
    return attrib_dict
def concat_list(listA, listB, delim=' '):
    """
    Concatenate the elements of two lists pair-wise.

    Parameters
    ----------
    listA : list
        Left-hand elements.
    listB : list
        Right-hand elements; must have the same length as ``listA``.
    delim : str
        Separator placed between the two elements of each pair.

    Returns
    -------
    list
        ``listA[i] + delim + listB[i]`` for every index ``i``.

    Raises
    ------
    IndexError
        If the two lists differ in length.
    """
    # Lists must be of equal length.
    if len(listA) != len(listB):
        raise IndexError('Input lists are not parallel.')

    return [left + delim + right for left, right in zip(listA, listB)]
def strip_non_ascii(s):
    """
    Returns the string without non-ASCII characters.

    Only characters whose code point lies strictly between 0 and 127 are
    kept, so NUL (0) and DEL (127) are removed as well.

    Parameters
    ----------
    s : string
        A string that may contain non-ASCII characters.

    Returns
    -------
    clean_string : string
        A string that does not contain non-ASCII characters.
    """
    kept = [char for char in s if 1 <= ord(char) <= 126]
    return u''.join(kept)
def strip_punctuation(s):
    """
    Return ``s`` without the characters listed in ``string.punctuation``.
    """
    punctuation = frozenset(string.punctuation)
    kept = [ch for ch in s if ch not in punctuation]
    return u''.join(kept)
def dict_from_node(node, recursive=False):
    """
    Converts ElementTree node to a dictionary.

    Parameters
    ----------
    node : ElementTree node
        Element whose direct children are converted.
    recursive : boolean
        If True, a child that has children of its own is converted to a
        nested dictionary. If False, its value is the number of children.

    Returns
    -------
    dict : nested dictionary.
        Tags as keys and values as values. A childless sub-element
        contributes its text, or an empty string when it has none.
        Sub-elements that occur multiple times in an element are
        contained in a list.
    """
    result = {}
    for child in node:
        # Work out the value this child contributes.
        if len(child) == 0:
            value = u'' if child.text is None else child.text
        elif recursive:
            # Drill down until a childless element is reached.
            value = dict_from_node(child, True)
        else:
            value = len(child)

        tag = child.tag
        if tag not in result:
            result[tag] = value                 # First occurrence of the tag.
        elif type(result[tag]) is list:
            result[tag].append(value)           # Third or later occurrence.
        else:
            # Second occurrence: start collecting the values in a list.
            result[tag] = [result[tag], value]
    return result
class Dictionary(object):
    """
    A two-way index for integer/string pairs.

    Assigning ``d[string] = integer`` or ``d[integer] = string`` records
    the pair in both directions, so either member can later be used to
    look up the other.
    """

    # ``type(u'')`` is ``unicode`` on Python 2 and ``str`` on Python 3, so
    # both kinds of Python 2 string are recognised as string keys.
    _STRING_TYPES = (str, type(u''))

    def __init__(self):
        self.by_str = {}    # string -> integer
        self.by_int = {}    # integer -> string

    def __setitem__(self, key, value):
        """
        Record the pair ``key``/``value`` in both directions.

        Keys that are neither strings nor integers are ignored.
        """
        if isinstance(key, self._STRING_TYPES):
            self.by_str[key] = value
            self.by_int[value] = key
        elif isinstance(key, int):
            self.by_int[key] = value
            self.by_str[value] = key

    def __getitem__(self, key):
        """
        Return the partner of ``key``.

        Raises ``KeyError`` for an unknown string or integer key; returns
        None for a key of any other type.
        """
        if isinstance(key, self._STRING_TYPES):
            return self.by_str[key]
        if isinstance(key, int):
            return self.by_int[key]
        return None