Source code for pybel_tools.summary.subgraph_summary

# -*- coding: utf-8 -*-

"""This module contains functions that handle and summarize subgraphs of graphs"""

from __future__ import print_function

import itertools as itt
from collections import defaultdict
from operator import itemgetter

from pybel.constants import *
from pybel.struct.filters.edge_predicates import edge_has_annotation
from ..selection.group_nodes import group_nodes_by_annotation, group_nodes_by_annotation_filtered
from ..utils import calculate_tanimoto_set_distances, count_dict_values

__all__ = [
    'count_subgraph_sizes',
    'calculate_subgraph_edge_overlap',
    'summarize_subgraph_edge_overlap',
    'rank_subgraph_by_node_filter',
    'summarize_subgraph_node_overlap',
]


def count_subgraph_sizes(graph, annotation='Subgraph'):
    """Count the number of nodes in each subgraph induced by an annotation.

    The nodes are first grouped by the values of the given annotation, then the
    size of each group is counted.

    :param pybel.BELGraph graph: A BEL graph
    :param annotation: The annotation to group by and compare. Defaults to 'Subgraph'
    :type annotation: str
    :return: A dictionary from {annotation value: number of nodes}
    :rtype: dict[str, int]
    """
    nodes_by_value = group_nodes_by_annotation(graph, annotation)
    return count_dict_values(nodes_by_value)


def calculate_subgraph_edge_overlap(graph, annotation='Subgraph'):
    """Calculate the pairwise edge overlap between the subgraphs induced by an annotation.

    Every edge carrying the given annotation is assigned to the subgraph named by its
    annotation value. Edges are recorded as ``(source, target)`` pairs, so parallel edges
    between the same two nodes count once per subgraph. All ordered pairs of subgraphs are
    then compared, including each subgraph with itself.

    Two measures of overlap are produced:

    1. Total number of edges overlap (intersection)
    2. Percentage overlap (tanimoto similarity): ``|intersection| / |union|``

    :param pybel.BELGraph graph: A BEL graph
    :param annotation: The annotation to group by and compare. Defaults to 'Subgraph'
    :type annotation: str
    :return: A 4-tuple of

             - ``{subgraph: set of edges}``
             - ``{subgraph 1: {subgraph 2: set of intersecting edges}}``
             - ``{subgraph 1: {subgraph 2: set of unioned edges}}``
             - ``{subgraph 1: {subgraph 2: tanimoto similarity}}``
    :rtype: tuple[dict, dict, dict, dict]
    """
    sg2edge = defaultdict(set)

    for u, v, d in graph.edges_iter(data=True):
        if not edge_has_annotation(d, annotation):
            continue
        sg2edge[d[ANNOTATIONS][annotation]].add((u, v))

    subgraph_intersection = defaultdict(dict)
    subgraph_union = defaultdict(dict)
    result = defaultdict(dict)

    for sg1, sg2 in itt.product(sg2edge, repeat=2):
        shared_edges = sg2edge[sg1] & sg2edge[sg2]
        all_edges = sg2edge[sg1] | sg2edge[sg2]

        subgraph_intersection[sg1][sg2] = shared_edges
        subgraph_union[sg1][sg2] = all_edges
        # float() forces true division: without ``from __future__ import division`` a plain
        # ``/`` on two ints truncates to 0 or 1 under Python 2. The union is never empty here
        # because a subgraph only appears in sg2edge once an edge has been added to it.
        result[sg1][sg2] = float(len(shared_edges)) / len(all_edges)

    return sg2edge, subgraph_intersection, subgraph_union, result


def summarize_subgraph_edge_overlap(graph, annotation='Subgraph'):
    """Return a similarity matrix between all subgraphs (or other given annotation).

    This is a convenience wrapper that keeps only the last element (the similarity
    mapping) of the result of :func:`calculate_subgraph_edge_overlap`.

    :param pybel.BELGraph graph: A BEL graph
    :param annotation: The annotation to group by and compare. Defaults to :code:`"Subgraph"`
    :type annotation: str
    :return: A similarity matrix in a dict of dicts, ``{subgraph 1: {subgraph 2: similarity}}``
    :rtype: dict
    """
    # Only the similarity scores are needed; the edge sets, intersections and unions are dropped.
    _edges, _intersections, _unions, similarity = calculate_subgraph_edge_overlap(graph, annotation)
    return similarity


def rank_subgraph_by_node_filter(graph, node_filters, annotation='Subgraph', reverse=True):
    """Rank subgraphs by how many of their nodes match a given filter.

    :param pybel.BELGraph graph: A BEL graph
    :param node_filters: A predicate or list of predicates (graph, node) -> bool
    :type node_filters: types.FunctionType or iter[types.FunctionType]
    :param annotation: The annotation to group by. Defaults to 'Subgraph'
    :type annotation: str
    :param reverse: If true (the default), the subgraphs with the highest counts come first;
                    otherwise the list is sorted in ascending order of count
    :type reverse: bool
    :return: A list of (annotation value, count) pairs sorted by count
    :rtype: list

    A use case for this function would be to identify which subgraphs contain the most differentially expressed
    genes.

    >>> from pybel import from_pickle
    >>> from pybel.constants import *
    >>> from pybel_tools.integration import overlay_type_data
    >>> from pybel_tools.summary import rank_subgraph_by_node_filter
    >>> import pandas as pd
    >>> graph = from_pickle('~/dev/bms/aetionomy/alzheimers.gpickle')
    >>> df = pd.read_csv('~/dev/bananas/data/alzheimers_dgxp.csv', usecols=['Gene', 'log2fc'])
    >>> data = {gene: log2fc for _, gene, log2fc in df.itertuples()}
    >>> overlay_type_data(graph, data, 'log2fc', GENE, 'HGNC', impute=0)
    >>> results = rank_subgraph_by_node_filter(graph, lambda g, n: 1.3 < abs(g.node[n]['log2fc']))
    """
    matching_nodes = group_nodes_by_annotation_filtered(graph, node_filters=node_filters, annotation=annotation)
    match_counts = count_dict_values(matching_nodes)
    # Sort on the count, i.e. the second element of each (annotation value, count) pair.
    return sorted(match_counts.items(), key=itemgetter(1), reverse=reverse)


def summarize_subgraph_node_overlap(graph, node_filters=None, annotation='Subgraph'):
    """Calculate the tanimoto similarity between subgraphs, based on their nodes passing the given filter.

    Provides an alternate view on subgraph similarity, from a more node-centric view.
    The nodes are grouped by annotation value with
    ``group_nodes_by_annotation_filtered`` and the resulting groups are passed to
    ``calculate_tanimoto_set_distances``.

    :param pybel.BELGraph graph: A BEL graph
    :param node_filters: A predicate or list of predicates (graph, node) -> bool, forwarded
                         unchanged to the grouping helper. Defaults to None
    :type node_filters: types.FunctionType or iter[types.FunctionType]
    :param annotation: The annotation to group by and compare. Defaults to 'Subgraph'
    :type annotation: str
    :return: The result of ``calculate_tanimoto_set_distances`` on the grouped nodes
    """
    nodes_by_value = group_nodes_by_annotation_filtered(graph, node_filters=node_filters, annotation=annotation)
    return calculate_tanimoto_set_distances(nodes_by_value)