Source code for annotations.tasks

"""
We should probably write some documentation.
"""

from __future__ import absolute_import

from django.contrib.auth.models import Group
from django.utils.safestring import SafeText
from django.contrib.contenttypes.models import ContentType

import requests
from bs4 import BeautifulSoup
from guardian.shortcuts import assign_perm
import uuid
import re
from itertools import groupby

import slate
from . import managers
from annotations.models import *
from annotations import quadriga

from celery import shared_task


[docs]def get_manager(name):
    print dir(managers), name
    try:
        return getattr(managers, name)
    except AttributeError:
        raise ValueError('No such manager.')


[docs]def tokenize(content, delimiter=u' '):
    """
    In order to annotate a text, we must first wrap "annotatable" tokens
    in <word></word> tags, with arbitrary IDs.

    Parameters
    ----------
    content : unicode
    delimiter : unicode
        Character or sequence by which to split and join tokens.

    Returns
    -------
    tokenizedContent : unicode
    """
    chunks = content.split(delimiter)
    pattern = u'<word id="{0}">{1}</word>'
    return delimiter.join([pattern.format(i, c) for i, c in enumerate(chunks)])


[docs]def retrieve(repository, resource):
    """
    Get the content of a resource.

    Parameters
    ----------
    repository : :class:`.Repository`
    resource : unicode or int
        Identifier by which the resource can be retrieved from ``repository``.

    Returns
    -------
    content : unicode
    """
    return repository.getContent(resource)


[docs]def scrape(url):
    """
    Retrieve text content from a website.

    Parameters
    ----------
    url : unicode
        Location of web resource.

    Returns
    -------
    textData : dict
        Metadata and text content retrieved from ``url``.
    """
    response = requests.get(uri, allow_redirects=True)

    # TODO : consider plugging in something like DiffBot.
    soup = BeautifulSoup(response.text, "html.parser")

    textData = {
        'title': soup.title.string,
        'content': response.text,
        'content-type': response.headers['content-type'],
    }
    return textData


[docs]def extract_text_file(uploaded_file):
    """
    Extract the text file, and return its content

    Parameters
    ----------
    uploaded_file : InMemoryUploadedFile
        The uploaded text file

    Returns
    -------
    Content of the text file as a string
    """
    if uploaded_file.content_type != 'text/plain':
        raise ValueError('uploaded_file file should be a plain text file')
    filecontent = ''
    for line in uploaded_file:
        filecontent += line + ' '
    return filecontent


[docs]def extract_pdf_file(uploaded_file):
    """
    Extract a PDF file and return its content

    Parameters
    ----------
    uploaded_file : InMemoryUploadedFile
        The uploaded PDF file

    Returns
    -------
    Content of the PDF file as a string
    """
    if uploaded_file.content_type != 'application/pdf':
        raise ValueError('uploaded_file file should be a PDF file')
    doc = slate.PDF(uploaded_file)
    filecontent = ''
    for content in doc:
        filecontent += content.decode('utf-8') + '\n\n'
    return filecontent


# TODO: refactor!! This signature stinks.
[docs]def save_text_instance(tokenized_content, text_title, date_created, is_public, user, uri=None):
    """
    This method creates and saves the text instance based on the parameters passed

    Parameters
    ----------
    tokenized_content : String
        The tokenized text
    text_title : String
        The title of the text instance
    date_created : Date
        The date to be associated with text instance
    is_public : Boolean
        Whether the text content is public or not
    user : User
        The user who saved the text content
    """
    if not uri:
        uri = 'http://vogonweb.net/' + str(uuid.uuid1())
    text = Text(tokenizedContent=tokenized_content,
            title=text_title,
            created=date_created,
            public=is_public,
            addedBy=user,
            uri=uri)
    text.save()
    assign_perm('annotations.view_text', user, text)
    if is_public:
        group = Group.objects.get_or_create(name='Public')[0]
        assign_perm('annotations.view_text', group, text)
    return text


@shared_task
def submit_relationsets_to_quadriga(relationsets, text, user, **kwargs):
    status, response = quadriga.submit_relationsets(relationsets, text, user, **kwargs)
    if status:
        accession = QuadrigaAccession.objects.create(**{
            'createdBy': user,
            'project_id': response.get('project_id'),
            'workspace_id': response.get('workspace_id'),
            'network_id': response.get('networkid')
        })

        for relationset in relationsets:
            relationset.submitted = True
            relationset.submittedOn = accession.created
            relationset.submittedWith = accession
            relationset.save()

            for relation in relationset.constituents.all():
                relation.submitted = True
                relation.submittedOn = accession.created
                relation.submittedWith = accession
                relation.save()

            for appellation in relationset.appellations():
                appellation.submitted = True
                appellation.submittedOn = accession.created
                appellation.submittedWith = accession
                appellation.save()


@shared_task
def accession_ready_relationsets():

    qs = RelationSet.objects.filter(submitted=False, pending=False)

    # Do not submit a relationset to Quadriga if the constituent interpretations
    #  involve concepts that are not resolved.
    all_rsets = [rs for rs in qs if rs.ready()]

    print 'processing %i relationsets' % len(all_rsets)
    project_grouper = lambda rs: getattr(rs.occursIn.partOf.first(), 'quadriga_id', -1)

    for project_id, project_group in groupby(sorted(all_rsets, key=project_grouper), key=project_grouper):
        print project_id
        for text_id, text_group in groupby(project_group, key=lambda rs: rs.occursIn.id):
            text = Text.objects.get(pk=text_id)
            for user_id, user_group in groupby(text_group, key=lambda rs: rs.createdBy.id):
                user = VogonUser.objects.get(pk=user_id)
                # We lose the iterator after the first pass, so we want a list here.
                rsets = []
                for rs in user_group:
                    rsets.append(rs)
                    rs.pending = True
                    rs.save()
                kwargs = {}
                if project_id:
                    kwargs.update({
                        'project_id': project_id,
                    })
                submit_relationsets_to_quadriga.delay(rsets, text, user, **kwargs)


[docs]def handle_file_upload(request, form):
    """
    Handle the uploaded file and route it to corresponding handlers

    Parameters
    ----------
    request : `django.http.requests.HttpRequest`
    form : `django.forms.Form`
        The form with uploaded content

    """
    uploaded_file = request.FILES['filetoupload']
    uri = form.cleaned_data['uri']
    text_title = form.cleaned_data['title']
    date_created = form.cleaned_data['datecreated']
    is_public = form.cleaned_data['ispublic']
    user = request.user
    file_content = None
    if uploaded_file.content_type == 'text/plain':
        file_content = extract_text_file(uploaded_file)
    elif uploaded_file.content_type == 'application/pdf':
        file_content = extract_pdf_file(uploaded_file)

    # Save the content if the above extractors extracted something
    if file_content != None:
        tokenized_content = tokenize(file_content)
        return save_text_instance(tokenized_content, text_title, date_created, is_public, user, uri)
VogonWeb

Navigation

Related Topics

Source code for annotations.tasks