# -*- coding: utf-8 -*-
"""This module contains functions that handle and summarize subgraphs of graphs"""
from __future__ import print_function
import itertools as itt
from collections import defaultdict
from operator import itemgetter
from pybel.constants import *
from pybel.struct.filters.edge_predicates import edge_has_annotation
from ..selection.group_nodes import group_nodes_by_annotation, group_nodes_by_annotation_filtered
from ..utils import calculate_tanimoto_set_distances, count_dict_values
__all__ = [
'count_subgraph_sizes',
'calculate_subgraph_edge_overlap',
'summarize_subgraph_edge_overlap',
'rank_subgraph_by_node_filter',
'summarize_subgraph_node_overlap',
]
[docs]def count_subgraph_sizes(graph, annotation='Subgraph'):
"""Counts the number of nodes in each subgraph induced by an anotation
:param pybel.BELGraph graph: A BEL graph
:param annotation: The annotation to group by and compare. Defaults to 'Subgraph'
:type annotation: str
:return: A dictionary from {annotation value: number of nodes}
:rtype: dict[str, int]
"""
return count_dict_values(group_nodes_by_annotation(graph, annotation))
[docs]def calculate_subgraph_edge_overlap(graph, annotation='Subgraph'):
"""Builds a dataframe to show the overlap between different subgraphs
Options:
1. Total number of edges overlap (intersection)
2. Percentage overlap (tanimoto similarity)
:param pybel.BELGraph graph: A BEL graph
:param annotation: The annotation to group by and compare. Defaults to 'Subgraph'
:type annotation: str
:return: {subgraph: set of edges}, {(subgraph 1, subgraph2): set of intersecting edges},
{(subgraph 1, subgraph2): set of unioned edges}, {(subgraph 1, subgraph2): tanimoto similarity},
"""
sg2edge = defaultdict(set)
for u, v, d in graph.edges_iter(data=True):
if not edge_has_annotation(d, annotation):
continue
sg2edge[d[ANNOTATIONS][annotation]].add((u, v))
subgraph_intersection = defaultdict(dict)
subgraph_union = defaultdict(dict)
result = defaultdict(dict)
for sg1, sg2 in itt.product(sg2edge, repeat=2):
subgraph_intersection[sg1][sg2] = sg2edge[sg1] & sg2edge[sg2]
subgraph_union[sg1][sg2] = sg2edge[sg1] | sg2edge[sg2]
result[sg1][sg2] = len(subgraph_intersection[sg1][sg2]) / len(subgraph_union[sg1][sg2])
return sg2edge, subgraph_intersection, subgraph_union, result
[docs]def summarize_subgraph_edge_overlap(graph, annotation='Subgraph'):
"""Returns a similarity matrix between all subgraphs (or other given annotation)
:param pybel.BELGraph graph: A BEL graph
:param annotation: The annotation to group by and compare. Defaults to :code:`"Subgraph"`
:type annotation: str
:return: A similarity matrix in a dict of dicts
:rtype: dict
"""
_, _, _, subgraph_overlap = calculate_subgraph_edge_overlap(graph, annotation)
return subgraph_overlap
[docs]def rank_subgraph_by_node_filter(graph, node_filters, annotation='Subgraph', reverse=True):
"""Ranks subgraphs by which have the most nodes matching an given filter
:param pybel.BELGraph graph: A BEL graph
:param node_filters: A predicate or list of predicates (graph, node) -> bool
:type node_filters: types.FunctionType or iter[types.FunctionType]
:param annotation:
:type annotation: str
:param reverse:
:type reverse: bool
:rtype: list
A use case for this function would be to identify which subgraphs contain the most differentially expressed
genes.
>>> from pybel import from_pickle
>>> from pybel.constants import *
>>> from pybel_tools.integration import overlay_type_data
>>> from pybel_tools.summary import rank_subgraph_by_node_filter
>>> import pandas as pd
>>> graph = from_pickle('~/dev/bms/aetionomy/alzheimers.gpickle')
>>> df = pd.read_csv('~/dev/bananas/data/alzheimers_dgxp.csv', columns=['Gene', 'log2fc'])
>>> data = {gene: log2fc for _, gene, log2fc in df.itertuples()}
>>> overlay_type_data(graph, data, 'log2fc', GENE, 'HGNC', impute=0)
>>> results = rank_subgraph_by_node_filter(graph, lambda g, n: 1.3 < abs(g.node[n]['log2fc']))
"""
r1 = group_nodes_by_annotation_filtered(graph, node_filters=node_filters, annotation=annotation)
r2 = count_dict_values(r1)
return sorted(r2.items(), key=itemgetter(1), reverse=reverse)
[docs]def summarize_subgraph_node_overlap(graph, node_filters=None, annotation='Subgraph'):
"""Calculates the subgraph similarity tanimoto similarity in nodes passing the given filter
Provides an alternate view on subgraph similarity, from a more node-centric view
"""
r1 = group_nodes_by_annotation_filtered(graph, node_filters=node_filters, annotation=annotation)
r2 = calculate_tanimoto_set_distances(r1)
return r2