# Source code for pybel_tools.document_utils.document_utils

# -*- coding: utf-8 -*-

"""Utilities to merge multiple BEL documents on the same topic"""

import logging
from typing import Iterable, Mapping, Optional, Set, TextIO, Union
from xml.etree import ElementTree

import requests

from bel_resources import make_knowledge_header
from ..constants import abstract_url_fmt, title_url_fmt

# Names exported by a star-import of this module.
__all__ = [
    'write_boilerplate',
]

# Module-level logger, named after this module's import path.
log = logging.getLogger(__name__)


def make_pubmed_abstract_group(pmids: Iterable[Union[str, int]]) -> Iterable[str]:
    """Build a skeleton for the citations' statements.

    For every distinct PubMed identifier, the title and the abstract are
    downloaded (one HTTP request each) and wrapped in ``SET Citation`` and
    ``SET Evidence`` statements, followed by the matching ``UNSET`` statements.

    Identifiers are normalized to stripped strings and de-duplicated in
    first-seen order, so the generated section is reproducible between runs
    and ``123`` and ``"123"`` are only fetched once.

    :param pmids: A list of PubMed identifiers
    :return: An iterator over the lines of the citation section
    """
    # dict.fromkeys drops duplicates but keeps insertion order; a plain set
    # would yield the citations in an arbitrary, run-dependent order.
    unique_pmids = dict.fromkeys(str(pmid).strip() for pmid in pmids)

    for pmid in unique_pmids:
        # Blank separator line before each citation group
        yield ''

        res = requests.get(title_url_fmt.format(pmid))
        title = res.content.decode('utf-8').strip()

        yield 'SET Citation = {{"{}", "{}"}}'.format(title, pmid)

        res = requests.get(abstract_url_fmt.format(pmid))
        abstract = res.content.decode('utf-8').strip()

        yield 'SET Evidence = "{}"'.format(abstract)
        yield '\nUNSET Evidence\nUNSET Citation'


def _sanitize(s):
    if s is None:
        return None
    return s.strip().replace('\n', '')


#: Allows for querying the Entrez Gene Summary utility by formatting with an Entrez ID or a list of comma-separated IDs
PUBMED_GENE_QUERY_URL = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=gene&id={}'


def get_entrez_gene_data(entrez_ids: Iterable[Union[str, int]]):
    """Get gene info from Entrez.

    All identifiers are sent in a single request to the URL built from
    ``PUBMED_GENE_QUERY_URL`` and the XML response is parsed into a dictionary.

    :param entrez_ids: Entrez Gene identifiers; each is converted to a string
     and stripped of surrounding whitespace before querying
    :return: A dictionary mapping each returned ``uid`` to a dictionary with
     the keys ``summary`` (sanitized text, or None if there is none) and
     ``description`` (raw text, or None if there is none). An empty input
     gives an empty dictionary without any request being made.
    """
    # Materialize first: the argument may be a one-shot iterator
    ids = [str(entrez_id).strip() for entrez_id in entrez_ids]
    if not ids:
        return {}

    response = requests.get(PUBMED_GENE_QUERY_URL.format(','.join(ids)))
    tree = ElementTree.fromstring(response.content)

    def _child_text(element, tag):
        """Return the text of the child *tag*, or None if the child is absent."""
        child = element.find(tag)
        return None if child is None else child.text

    return {
        element.attrib['uid']: {
            'summary': _sanitize(_child_text(element, 'Summary')),
            'description': _child_text(element, 'Description'),
        }
        for element in tree.findall('./DocumentSummarySet/DocumentSummary')
    }


def make_pubmed_gene_group(entrez_ids: Iterable[Union[str, int]]) -> Iterable[str]:
    """Build a skeleton for gene summaries.

    All identifiers are sent in a single request to the URL built from
    ``PUBMED_GENE_QUERY_URL``. For each document summary in the response, a
    comment line with the description is emitted, followed by ``SET Citation``
    and ``SET Evidence`` statements and the matching ``UNSET`` statements.

    A missing or empty summary or description is rendered as an empty string,
    so such an entry neither raises nor prints the literal text ``None``.

    :param entrez_ids: A list of Entrez Gene identifiers to query the PubMed service
    :return: An iterator over statement lines for NCBI Entrez Gene summaries
    """
    # Materialize first: the argument may be a one-shot iterator
    ids = [str(entrez_id).strip() for entrez_id in entrez_ids]
    if not ids:
        # Nothing to look up, so skip the HTTP round trip entirely
        return

    response = requests.get(PUBMED_GENE_QUERY_URL.format(','.join(ids)))
    tree = ElementTree.fromstring(response.content)

    for doc_summary in tree.findall('./DocumentSummarySet/DocumentSummary'):
        # findtext gives '' for an absent child (via default) or an empty one
        description = doc_summary.findtext('Description', default='')
        summary = doc_summary.findtext('Summary', default='').strip().replace('\n', '')

        yield '\n# {}'.format(description)
        yield 'SET Citation = {{"Other", "PubMed Gene", "{}"}}'.format(doc_summary.attrib['uid'])
        yield 'SET Evidence = "{}"'.format(summary)
        yield '\nUNSET Evidence\nUNSET Citation'


def write_boilerplate(
        name: str,
        version: Optional[str] = None,
        description: Optional[str] = None,
        authors: Optional[str] = None,
        contact: Optional[str] = None,
        copyright: Optional[str] = None,
        licenses: Optional[str] = None,
        disclaimer: Optional[str] = None,
        namespace_url: Optional[Mapping[str, str]] = None,
        namespace_patterns: Optional[Mapping[str, str]] = None,
        annotation_url: Optional[Mapping[str, str]] = None,
        annotation_patterns: Optional[Mapping[str, str]] = None,
        annotation_list: Optional[Mapping[str, Set[str]]] = None,
        pmids: Optional[Iterable[Union[str, int]]] = None,
        entrez_ids: Optional[Iterable[Union[str, int]]] = None,
        file: Optional[TextIO] = None,
) -> None:
    """Write a boilerplate BEL document, with standard document metadata, definitions.

    The header lines produced by ``make_knowledge_header`` are printed first,
    followed by the citation skeletons for ``pmids`` and then the gene summary
    skeletons for ``entrez_ids``, when those arguments are given.

    :param name: The unique name for this BEL document
    :param contact: The email address of the maintainer
    :param description: A description of the contents of this document
    :param authors: The authors of this document
    :param version: The version. Defaults to ``1.0.0`` if None or empty.
    :param copyright: Copyright information about this document
    :param licenses: The license applied to this document
    :param disclaimer: The disclaimer for this document
    :param namespace_url: an optional dictionary of {str name: str URL} of namespaces
    :param namespace_patterns: An optional dictionary of {str name: str regex} namespaces
    :param annotation_url: An optional dictionary of {str name: str URL} of annotations
    :param annotation_patterns: An optional dictionary of {str name: str regex} of regex annotations
    :param annotation_list: An optional dictionary of {str name: set of names} of list annotations
    :param pmids: A list of PubMed identifiers to auto-populate with citation and abstract
    :param entrez_ids: A list of Entrez identifiers to autopopulate the gene summary as evidence
    :param file: A writable file or file-like. If None, defaults to :data:`sys.stdout`
    """
    lines = make_knowledge_header(
        name=name,
        version=version or '1.0.0',
        description=description,
        authors=authors,
        contact=contact,
        copyright=copyright,
        licenses=licenses,
        disclaimer=disclaimer,
        namespace_url=namespace_url,
        namespace_patterns=namespace_patterns,
        annotation_url=annotation_url,
        annotation_patterns=annotation_patterns,
        annotation_list=annotation_list,
    )

    for line in lines:
        print(line, file=file)

    if pmids is not None:
        for line in make_pubmed_abstract_group(pmids):
            print(line, file=file)

    if entrez_ids is not None:
        for line in make_pubmed_gene_group(entrez_ids):
            print(line, file=file)