# Source code for pybel_tools.analysis.neurommsig.algorithm

# -*- coding: utf-8 -*-

"""An implementation of the NeuroMMSig mechanism enrichment algorithm [DomingoFernandez2017]_.

.. [DomingoFernandez2017] Domingo-Fernández, D., *et al* (2017). `Multimodal mechanistic signatures for
    neurodegenerative diseases (NeuroMMSig): A web server for mechanism enrichment
    <https://doi.org/10.1093/bioinformatics/btx399>`_. Bioinformatics, 33(22), 3679–3681.
"""

import itertools as itt
import logging
from collections import Counter

from pybel import Pipeline
from pybel.constants import GENE
from pybel.struct import (
    collapse_all_variants, collapse_to_genes, enrich_protein_and_rna_origins, get_nodes_by_function,
    get_subgraphs_by_annotation,
)
from ...utils import calculate_betweenness_centality

# Names exported by ``from ... import *``. The per-component scoring helpers defined further down
# (neurommsig_gene_ora, neurommsig_hubs, neurommsig_topology) are not listed here.
__all__ = [
    'neurommsig_graph_preprocessor',
    'get_neurommsig_scores_prestratified',
    'get_neurommsig_scores',
    'get_neurommsig_score',
]

# Module-level logger named after this module.
log = logging.getLogger(__name__)

# Pre-processing pipeline built from three pybel.struct functions. get_neurommsig_scores() runs it on
# the input graph when called with ``preprocess=True``. The order below lines up with the
# pre-processing steps enumerated in this module's docstrings (presumably one function per step):
#   1. infer the central dogma
#   2. collapse all proteins, RNAs and miRNAs to genes
#   3. collapse variants to genes
neurommsig_graph_preprocessor = Pipeline.from_functions([
    enrich_protein_and_rna_origins,
    collapse_to_genes,
    collapse_all_variants,
])


def get_neurommsig_scores_prestratified(it, genes, ora_weight=None, hub_weight=None, top=None, topology_weight=None):
    """Run the NeuroMMSig algorithm on each graph of a pre-computed stratification.

    This function does not pre-process the graphs itself. Use :func:`get_neurommsig_scores` with
    ``preprocess=True`` (or run :data:`neurommsig_graph_preprocessor` yourself) if the graphs still need it.

    :param iter[tuple[str,pybel.BELGraph]] it: A pre-stratified set of graphs, as pairs of
     (annotation value, subgraph)
    :param list[tuple] genes: A list of gene nodes
    :param Optional[float] ora_weight: The relative weight of the over-enrichment analysis score from
     :py:func:`neurommsig_gene_ora`. Defaults to 1.0.
    :param Optional[float] hub_weight: The relative weight of the hub analysis score from :py:func:`neurommsig_hubs`.
     Defaults to 1.0.
    :param Optional[float] top: The percentage of top genes to use as hubs. Defaults to 5% (0.05).
    :param Optional[float] topology_weight: The relative weight of the topological analysis score from
     :py:func:`neurommsig_topology`. Defaults to 1.0.
    :return: A dictionary from {annotation value: NeuroMMSig composite score}
    :rtype: dict[str, float]
    """
    # The same genes are scored against every subgraph, so materialize them once; a one-shot
    # iterator would otherwise be exhausted after the first subgraph.
    genes = list(genes)

    return {
        annotation_value: get_neurommsig_score(
            subgraph,
            genes,
            ora_weight=ora_weight,
            hub_weight=hub_weight,
            top=top,
            topology_weight=topology_weight,
        )
        for annotation_value, subgraph in it
    }


def get_neurommsig_scores(graph, genes, annotation='Subgraph', ora_weight=None, hub_weight=None, top=None,
                          topology_weight=None, preprocess=False):
    """Optionally preprocess the graph, stratify it by the given annotation, then run NeuroMMSig on each subgraph.

    :param pybel.BELGraph graph: A BEL graph
    :param list[tuple] genes: A list of gene nodes
    :param str annotation: The annotation to use to stratify the graph to subgraphs
    :param Optional[float] ora_weight: The relative weight of the over-enrichment analysis score from
     :py:func:`neurommsig_gene_ora`. Defaults to 1.0.
    :param Optional[float] hub_weight: The relative weight of the hub analysis score from :py:func:`neurommsig_hubs`.
     Defaults to 1.0.
    :param Optional[float] top: The percentage of top genes to use as hubs. Defaults to 5% (0.05).
    :param Optional[float] topology_weight: The relative weight of the topological analysis score from
     :py:func:`neurommsig_topology`. Defaults to 1.0.
    :param bool preprocess: If true, run :data:`neurommsig_graph_preprocessor` on the graph first.
    :return: A dictionary from {annotation value: NeuroMMSig composite score}, or ``None`` if none of the given
     genes is in the (optionally preprocessed) graph
    :rtype: Optional[dict[str, float]]

    Pre-processing steps (only applied when ``preprocess`` is true):

    1. Infer the central dogma with :func:`pybel.struct.enrich_protein_and_rna_origins`
    2. Collapse all proteins, RNAs and miRNAs to genes with :func:`pybel.struct.collapse_to_genes`
    3. Collapse variants to genes with :func:`pybel.struct.collapse_all_variants`
    """
    if preprocess:
        graph = neurommsig_graph_preprocessor.run(graph)

    # Materialize once: the membership check below would otherwise partially consume a one-shot
    # iterator and leave the scoring step with an incomplete gene list.
    genes = list(genes)

    if not any(gene in graph for gene in genes):
        log.debug('no genes mapping to graph')
        return None

    it = get_subgraphs_by_annotation(graph, annotation=annotation).items()

    return get_neurommsig_scores_prestratified(
        it,
        genes,
        ora_weight=ora_weight,
        hub_weight=hub_weight,
        top=top,
        topology_weight=topology_weight,
    )


def get_neurommsig_score(graph, target_genes, ora_weight=None, hub_weight=None, top=None, topology_weight=None):
    """Calculate the composite NeuroMMSig score for a given list of genes.

    The composite score is the weighted average of the three component scores returned by
    :py:func:`neurommsig_gene_ora`, :py:func:`neurommsig_hubs` and :py:func:`neurommsig_topology`.

    :param pybel.BELGraph graph: A BEL graph
    :param list[tuple] target_genes: A list of gene nodes
    :param Optional[float] ora_weight: The relative weight of the over-enrichment analysis score from
     :py:func:`neurommsig_gene_ora`. Defaults to 1.0.
    :param Optional[float] hub_weight: The relative weight of the hub analysis score from :py:func:`neurommsig_hubs`.
     Defaults to 1.0.
    :param Optional[float] top: The percentage of top genes to use as hubs. Defaults to 5% (0.05).
    :param Optional[float] topology_weight: The relative weight of the topological analysis score from
     :py:func:`neurommsig_topology`. Defaults to 1.0.
    :return: The NeuroMMSig composite score
    :rtype: float
    :raises ValueError: if the three weights sum to zero
    """
    # Only a missing (None) weight falls back to the default; an explicit 0 must stay 0 so that
    # a component can be switched off.
    ora_weight = 1.0 if ora_weight is None else ora_weight
    hub_weight = 1.0 if hub_weight is None else hub_weight
    topology_weight = 1.0 if topology_weight is None else topology_weight

    total_weight = ora_weight + hub_weight + topology_weight
    if total_weight == 0:
        # Checked before any scoring so the caller gets a clear error instead of a ZeroDivisionError
        raise ValueError('ora_weight, hub_weight and topology_weight must not sum to zero')

    # Each component iterates the genes, so make sure they can be traversed more than once
    target_genes = list(target_genes)

    ora_score = neurommsig_gene_ora(graph, target_genes)
    hub_score = neurommsig_hubs(graph, target_genes, top=top)
    topology_score = neurommsig_topology(graph, target_genes)

    weighted_sum = ora_weight * ora_score + hub_weight * hub_score + topology_weight * topology_score
    return weighted_sum / total_weight


def neurommsig_gene_ora(graph, target_genes):
    """Calculate the fraction of the graph's gene nodes that are among the target genes.

    Assume: graph central dogma inferred, collapsed to genes, collapsed variants

    :param pybel.BELGraph graph: A BEL graph
    :param iter target_genes: An iterable of nodes
    :return: ``|graph genes that are target genes| / |graph genes|``, or 0.0 if the graph has no gene nodes
    :rtype: float
    """
    graph_genes = set(get_nodes_by_function(graph, GENE))

    if not graph_genes:
        # Nothing in the graph can be matched; avoid dividing by zero
        return 0.0

    # float() forces true division even where ``/`` between two ints would truncate
    return len(graph_genes.intersection(target_genes)) / float(len(graph_genes))


def neurommsig_hubs(graph, target_genes, top=None):
    """Calculate the fraction of the graph's hub genes that are among the target genes.

    Hub genes are the gene nodes with the highest betweenness centrality in the graph.

    Assume: graph central dogma inferred, collapsed to genes, collapsed variants, graph has more than 20 nodes

    :param pybel.BELGraph graph: A BEL graph
    :param iter[tuple] target_genes: A list of nodes
    :param Optional[float] top: The percentage of top genes to use as hubs. Defaults to 5% (0.05).
    :return: The fraction of the top hub genes found in the target genes, or 0.0 if the graph has fewer
     than 20 nodes
    :rtype: float
    """
    top = top or 0.05

    if graph.number_of_nodes() < 20:
        log.debug('Graph has less than 20 nodes')
        return 0.0

    graph_genes = set(get_nodes_by_function(graph, GENE))
    # Set membership is O(1) and also copes with a one-shot iterator of target genes
    target_genes = set(target_genes)

    # Betweenness centrality restricted to the gene nodes
    bc = Counter({
        node: betweenness_centrality
        for node, betweenness_centrality in calculate_betweenness_centality(graph).items()
        if node in graph_genes
    })

    # TODO consider continuous analog with weighting by percentile
    # Always look at one hub at least, even when ``top`` of the genes rounds down to zero
    n = max(1, int(len(graph_genes) * top))

    # most_common() yields (node, centrality) pairs; only the node is compared to the targets
    hub_hits = sum(
        node in target_genes
        for node, _ in bc.most_common(n)
    )

    # float() forces true division even where ``/`` between two ints would truncate
    return hub_hits / float(n)


def neurommsig_topology(graph, nodes):
    r"""Calculate the node neighbor score for a given list of nodes.

    Counts the ordered pairs ``(u, v)`` of distinct input nodes for which ``v`` is in the graph and ``u``
    is in ``graph[v]``, then divides by the number of ordered pairs of input nodes.

    -  Doesn't consider self loops
    -  Input nodes that are missing from the graph still count towards ``n``

    :param pybel.BELGraph graph: A BEL graph
    :param list[tuple] nodes: A list of nodes
    :return: The neighbor score, or 0.0 if fewer than two nodes are given
    :rtype: float

    .. math::

         \frac{\sum_i^n N_G[i]}{n*(n-1)}
    """
    nodes = list(nodes)
    n = len(nodes)

    if n <= 1:
        # No pair of nodes to compare
        return 0.0

    unnormalized_sum = sum(
        u in graph[v]
        for u, v in itt.product(nodes, repeat=2)
        if v in graph and u != v
    )

    # The float literal keeps this a true division
    return unnormalized_sum / (n * (n - 1.0))