Source code for thema.probe.telescope

# File: probe/telescope.py
# Last Update: 05/15/24
# Updated by: JW

import importlib
import pickle

import matplotlib.pyplot as plt
import networkx as nx
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import seaborn as sns

pio.renderers.default = "browser"

from .. import config
from .data_utils import get_nearestTarget, sunset_dict
from .visual_utils import (
    _column_color_mapping,
    _group_color_mapping,
    _match_column_order,
    _normalize_df,
    _reduce_colorOpacity,
)



[docs]
class Telescope:
    """
    Telescope Class - to view star objects
        a suite to meet all your visualization needs

    Members
    ------
    observatory:
        Observatory instance built from the pickled star file; every
        make<VISUAL>() function reads its data and graph through it
    pos: dict
        node positioning data for graphs and components, initially computed
        with `nx.spring_layout` (maps each node to its coordinates)

    Functions
    ---------
    makeGraph()
        Visualize a graph!
    makeHeatmap()
        Visualize a breakdown of your connected components as a heatmap!
    makeSankey()
        Create a sankey diagram from a custom score_function
    makePathGraph()
        Creates a shortest-path graph for a single component, based on a custom definition of target nodes

    Example
    --------
    >>> star_fp = '<PATH TO FILE>/jmap_clustererHDBSCANmin_cluster_size10_minIntersection-1_nCubes10_percOverlap0.6_id3_3.pkl'
    >>> telescope_instance = Telescope(star_fp)
    """

    def __init__(self, star_file):
        """
        Build a Telescope around a pickled star object.

        The star file is unpickled once here, only to read the class name of
        the stored object. That name selects (via `config.star_to_observatory`)
        the observatory configuration, whose `module` is imported and asked
        for the Observatory class to instantiate on the same file.

        Parameters
        ----------
        star_file : str
            filepath to a pickled star object

        """
        # NOTE(security): pickle.load executes arbitrary code embedded in the
        # file -- only open star files that come from a trusted source.
        with open(star_file, "rb") as handle:
            star_name = type(pickle.load(handle)).__name__

        # Resolve star class name -> config attribute -> observatory module.
        observatory_cfg = getattr(config, config.star_to_observatory[star_name])
        observatory_module = importlib.import_module(observatory_cfg.module)
        observatory_cls = observatory_module.initialize()
        self.observatory = observatory_cls(star_file)

        # Default layout of the full graph; fixed k/seed keep it reproducible.
        full_graph = self.observatory.star.starGraph.graph
        self._pos = nx.spring_layout(full_graph, k=0.12, seed=6)

    @property
    def pos(self):
        """
        Get the stored node layout used when drawing graphs.

        Returns
        -------
        dict
            The layout currently stored on the instance. It is initialised in
            `__init__` from `nx.spring_layout(..., k=0.12, seed=6)`, i.e. a
            mapping from each graph node to its coordinates.

        Notes
        -----
        This member variable ensures that graph layouts are held constant when viewing
        graphs, groups/components, and path graphs. It is updated when passing `seed`
        and/or `k` to the `makeGraph()` and `makePathGraph()` functions.
        """
        return self._pos

    @pos.setter
    def pos(self, positions):
        """
        Set the positions to support resetting and storing positions across
        make<VISUAL>() functions.

        Parameters
        ----------
        positions : dict
            The node positions to store. No validation is performed here; the
            drawing functions pass this value to networkx as `pos`, so it
            should map every node that will be drawn to its coordinates.

        Returns
        -------
        None
        """
        self._pos = positions


[docs]
    def makeGraph(
        self,
        group_number: int = None,
        k: float = None,
        seed: int = None,
        col: str = None,
        aggregation_func=None,
        hideLegend: bool = False,
        node_size_multiple: int = 10,
    ):
        """
        Visualize a graph!

        Draws either the whole star graph or a single connected component,
        colouring nodes by group (default) or by an aggregated column value.

        Parameters
        --------
        group_number : int, default None
            graph connected component number to subset the visualization to
            For example, just show component 1 and not the entire graph.
            None draws the entire graph.

        k : float, default None
            value from 0-1, determines optimal distance between nodes
            setting nx.spring_layout positions. When only `seed` is given,
            0.12 is used.

        seed : int, default None
            Random state for deterministic node layouts. When only `k` is
            given, 6 is used. If both `k` and `seed` are None the layout
            already stored in `self.pos` is reused unchanged.

        col : str, default None
            Column to color nodes by - from the raw data.
            None colors nodes by group (connected component).

        aggregation_func : callable, defaults to calculating mean
            method by which to aggregate values within a node for coloring
                - color node by the median value or by the sum of values for example
            e.g. np.mean, np.median, np.sum. Only valid together with `col`.

        hideLegend : bool, default False
            toggle the graph/component's legend on or off

        node_size_multiple : int or float, default 10
            each node is drawn with size
            `len(node "membership" attribute) * node_size_multiple`

        Returns
        -------
        None : the figure is displayed with `plt.show()`

        Raises
        ------
        AssertionError
            if `aggregation_func` is neither None nor callable
        KeyError
            if `aggregation_func` is given without `col`
        ValueError
            if `group_number` is not one of the observatory's group numbers

        Example
        ------
        Visualize connected component #3 with nodes colored by the sum of total pollution of coal plants in the node
        (example using a dataset on coal plant impacts)
        >>> tel = Telescope(star_filePath)
        >>> tel.makeGraph(group_number=3, col="Total Pollution", aggregation_func=np.sum)

        """
        assert aggregation_func is None or callable(
            aggregation_func
        ), "aggregation_func must be a function or None"

        if col is None and aggregation_func is not None:
            raise KeyError(
                "Cannot use a node-color aggregation function if coloring nodes by GROUP (CONNECTED COMPONENT)"
            )

        if group_number is None:
            G = self.observatory.star.starGraph.graph
        elif group_number in self.observatory.get_group_numbers():
            G = self.observatory.star.starGraph.components[group_number]
        else:
            raise ValueError("Group number not found in graph components list")

        fig = plt.figure(figsize=(14, 8), dpi=500)
        ax = fig.add_subplot()

        if col is not None:
            if aggregation_func is None:
                aggregation_func = lambda x: x.mean()
                aggregation_func.__name__ = "Mean"
            # Captured for the colorbar label below.
            func_name = aggregation_func.__name__
            color_dict, colors, norm = _column_color_mapping(
                obs=self.observatory, col=col, aggregation_func=aggregation_func, G=G
            )
        else:
            color_dict, colors, norm = _group_color_mapping(obs=self.observatory, G=G)

        # --> size by number of items per node, scaled by node_size_multiple.
        # Multiply the count (not the list) so float multipliers work and no
        # throwaway repeated list is built.
        node_sizes = [
            len(attrs.get("membership", [])) * node_size_multiple
            for _, attrs in G.nodes(data=True)
        ]

        if k is not None or seed is not None:
            new_layout = nx.spring_layout(
                G, k=k if k is not None else 0.12, seed=seed if seed is not None else 6
            )
            # Merge into the stored layout instead of replacing it: when G is
            # a single component, replacing would discard the positions of
            # every other node and break a later whole-graph draw.
            stored = self.pos
            merged_layout = dict(stored) if isinstance(stored, dict) else {}
            merged_layout.update(new_layout)
            self.pos = merged_layout

        # Gen Graph Viz (drawn explicitly on the axes created above)
        nx.draw(
            G,
            pos=self.pos,
            ax=ax,
            node_color=colors,
            node_size=node_sizes,
            font_size=8,
            font_color="white",
            width=0.5,
            edgecolors="white",
        )

        ## --> create custom legend
        if not hideLegend:
            if col is None:
                # sorted() keeps the legend order stable between runs
                unique_groups = sorted(set(color_dict.values()))
                legend_labels = [f"Group {group}" for group in unique_groups]
                legend_colors = plt.cm.coolwarm(norm(unique_groups))
                legend_handles = [
                    plt.Line2D(
                        [0],
                        [0],
                        marker="o",
                        color="w",
                        markerfacecolor=color,
                        markersize=10,
                        label=label,
                    )
                    for color, label in zip(legend_colors, legend_labels)
                ]
                ax.legend(
                    handles=legend_handles,
                    labels=legend_labels,
                    loc="best",
                    fontsize="small",
                )
            else:
                sm = plt.cm.ScalarMappable(cmap=plt.cm.coolwarm, norm=norm)
                sm.set_array([])
                cbar = plt.colorbar(sm, ax=ax, shrink=0.65)
                ## --> label legend to indicate node-aggregation method
                cbar.set_label(
                    f"{col} -- Node {func_name}", fontsize="small", labelpad=10
                )

        plt.show()



[docs]
    def makeHeatmap(
        self,
        nodeDescriptorCols: bool = True,
        ncols: int | list[str] = None,
        aggregation_func=None,
        topZscoreCols: bool = False,
    ):
        """
        Visualize a breakdown of your connected components!

        Parameters
        ---------
        ncols : int | List[Any], default 15
            int: the number of columns to visualize, selected from the front of your data
            list[str]: a list of specific columns from your data to create a heatmap of

        aggregation_func : np.<function>, defaults to calculating mean
            method by which to aggregate values within a node for coloring
                - color node by the median value or by the sum of values for example
            supports all numpy aggregation functions such as np.mean, np.median, np.sum, etc

        topZscoreCols : bool = False,
            visualize the ncols with the highest zscores with a group -- in other words, the columns in which one
            or more groups is the MOST different than the dataset norm

            Overrides a ncols int specification

        nodeDescriptorCols : bool = True,
            Smart select columns to view in your heatmap, based on a density representing the
            composition of a group by its nodes' descriptions.

            Overrides a ncols int specification

        Returns
        --------
        n/a : displays an inline matplotlib.plt

        ╭──────────────────────────────────────────────────────────────────────────────────────────────────────╮
        │ TODO - Dynamically select `ncols` based on cols w/ highest variance between groups for default viz   |
        ╰──────────────────────────────────────────────────────────────────────────────────────────────────────╯

        Example
        --------
        >>> tel = Telescope(star_filePath)
        >>> tel.makeHeatmap(ncols=['Pollution', 'Health Impact'], aggregation_func=np.mean)

        """
        assert isinstance(
            ncols, (int, list, type(None))
        ), "`ncols` must be an integer, a list, or None!"

        assert aggregation_func is None or callable(
            aggregation_func
        ), "aggregation_func must be a function or None"

        data = self.observatory.get_aggregatedGroupDf(aggregation_func).drop(
            columns=["Group"]
        )
        if isinstance(ncols, list):
            if all(col in self.observatory.clean.columns for col in ncols):
                data = data[ncols]
            else:
                raise UserWarning(
                    "Please ensure `ncols` contains columns from the CLEAN dataset"
                )
        ## --> dynamic col selection implemented, based on highest zscores
        elif topZscoreCols:
            zscore_cols = list(self.observatory.dataset_zscores_df(ncols).columns)
            data = data[zscore_cols]
        ## --> dynamic col selection implemented, based on node descriptors
        elif nodeDescriptorCols:
            nodeIdentifiers = self.observatory.get_group_descriptions()
            sub_keys = set()
            for _, sub_dict in nodeIdentifiers.items():
                sub_keys.update(sub_dict.keys())

            sub_keys_list = list(sub_keys)
            data = data[sub_keys_list]
        else:
            data = (
                self.observatory.get_aggregatedGroupDf(aggregation_func)
                .iloc[:, :ncols]
                .drop(columns=["Group"])
            )

        ## --> format annotations so base-data is clean/encoded/scaled and box labeling is based on the raw data
        annotations = self.observatory.get_aggregatedGroupDf(
            aggregation_func, clean=False
        ).drop(columns=["Group"])
        annotations = _match_column_order(data, annotations)
        normalized_df = _normalize_df(data)

        plt.figure(figsize=(12, 3), dpi=1000)
        ax = sns.heatmap(
            normalized_df,
            annot=annotations,
            # annot=normalized_df,
            fmt=".2f",
            # vmin=0,
            # vmax=1,
            cbar=False,
            annot_kws={"size": 7},
            cmap="coolwarm",
            linewidths=0.5,
        )

        ## --> reduce font size + format annotations to ensure text does not run out of heatmap box
        for text in ax.texts:
            value = text.get_text()
            num_digits = len(value.replace(".", ""))
            if num_digits <= 6:
                text.set_fontsize(6)
            elif 7 <= num_digits <= 5:
                text.set_fontsize(5)
            elif num_digits == 9:
                text.set_fontsize(4)

        ax.set_xticklabels(ax.get_xticklabels(), rotation=35, ha="right", fontsize=8)
        ax.set_yticklabels(ax.get_yticklabels(), rotation=0, ha="right", fontsize=10)

        # --> to remove xaxis labels
        # plt.xticks([])

        plt.ylabel("Group Number")
        plt.show()



[docs]
    def makeSankey(
        self, score_function, dropUnclustered: bool = True, title_text: str = None
    ):
        """
        Creates a Sankey Diagram based on the score function.

        Each item of `observatory.data` flows from its group (left-hand
        nodes) to the label assigned by `score_function` (right-hand nodes);
        link widths are the number of items sharing a (group, label) pair.

        Parameters
        ----------
        score_function: function, pd:DataFrame -> List
            score_function must take in a dataframe and return a classification (categorical)
            of elements, one entry per row and in row order. Lists, arrays
            and Series are accepted; the values are used positionally.

        dropUnclustered : bool, default True
            drop items whose group id is -1 before building the diagram

        title_text : str, default None
            title passed to the plotly layout

        Returns
        -------
        None : the figure is displayed with `fig.show()`

        Example
        ------
        Assuming data has columns "height" and "age" columns, one could define a score
        function as follows:

        ```
        def my_score_function(df):
            scores = 0.5 * df['height'] + 2 * df['age']
            labels = ['high' if score > 20 else 'low' for score in scores]
            return labels
        ```

        """
        data = self.observatory.data

        # Build both columns from plain lists so they pair up by row position.
        # Assigning a Series returned by score_function would align on index
        # labels and produce NaN labels when the data index is not 0..n-1.
        sankey_df = pd.DataFrame(
            {
                "Group": list(data.index.map(self.observatory.get_items_groupID)),
                "Labels": list(score_function(data)),
            }
        )

        if dropUnclustered:
            sankey_df = sankey_df[sankey_df["Group"] != -1]

        sankey_df = (
            sankey_df.groupby(["Group", "Labels"]).size().reset_index(name="Value")
        )

        # Sources (groups) and targets (labels) are indexed separately, so a
        # label that equals a group id (e.g. both 0) still gets its own node.
        # unique() keeps a deterministic order, unlike a set.
        group_ids = sankey_df["Group"].unique().tolist()
        label_ids = sankey_df["Labels"].unique().tolist()
        nodes = group_ids + label_ids

        group_index = {group: i for i, group in enumerate(group_ids)}
        label_index = {
            label: i + len(group_ids) for i, label in enumerate(label_ids)
        }

        # Create edges as parallel source / target / value lists
        sources = [group_index[group] for group in sankey_df["Group"].tolist()]
        targets = [label_index[label] for label in sankey_df["Labels"].tolist()]
        values = [int(value) for value in sankey_df["Value"].tolist()]

        ## TODO --> color TARGETS differently (red:high, blue:low for example)
        # Also add interpolation between landmark colors for larger Sankeys

        num_nodes = len(nodes)
        # max(..., 1) avoids a ZeroDivisionError when only one node exists
        span = max(num_nodes - 1, 1)
        colors = px.colors.sample_colorscale(
            px.colors.sequential.RdBu_r, [n / span for n in range(num_nodes)]
        )

        link_colors = _reduce_colorOpacity(colors, opacity=0.3)

        # Create Sankey diagram
        fig = go.Figure(
            data=[
                go.Sankey(
                    node=dict(
                        pad=15,
                        thickness=20,
                        line=dict(color="black", width=0.5),
                        label=nodes,
                        color=list(colors),
                    ),
                    link=dict(
                        source=sources,
                        target=targets,
                        value=values,
                        # each link takes the (faded) colour of its source node
                        color=[link_colors[source] for source in sources],
                    ),
                )
            ]
        )

        # Update layout
        fig.update_layout(
            title_text=title_text,
            font_size=10,
            template="simple_white",
        )

        fig.show()



[docs]
    def makePathGraph(
        self,
        col: str,
        group_number: int,
        aggregation_func=None,
        top: bool = True,
        percentage: float = 0.1,
        path_labels: bool = False,
        node_labels: bool = False,
        k: float = None,
        seed: int = None,
        node_size_multiple: int = 10,
    ):
        """
        Make a shortest-path graph by identifying sink (target) nodes and
        visualizing distance to them

        Parameters
        ----------
        col : str
            Column used to compute each node's value - from the raw data

        group_number : int
            graph connected component number to subset the visualization to
            For example, just show component 1 and not the entire graph

        aggregation_func : callable, default None
            method by which to aggregate values within a node; passed through
            to `observatory.define_nodeValueDict`
            e.g. np.mean, np.median, np.sum

        top : bool, default True
            Whether to select the top n percentage or the bottom n percentage
            of nodes as target/sink nodes
            NOTE: corresponds to the `percentage` param

        percentage : float, default 0.1
            The n-th percentage of nodes to select as sinks/targets
            NOTE: corresponds to the `top` param

        path_labels : bool, default False
            add labels to the nodes indicating target nodes, and the target
            that each node is closest to.

        node_labels : bool, default False
            add labels to the nodes, showing their node IDs for getting
            node-level data. When False, sink nodes are marked with an "S".

        k : float, default None
            value from 0-1, determines optimal distance between nodes
            setting nx.spring_layout positions. When only `seed` is given,
            0.12 is used.

        seed : int, default None
            Random state for deterministic node layouts. When only `k` is
            given, 6 is used. If both `k` and `seed` are None the layout
            already stored in `self.pos` is reused unchanged.

        node_size_multiple : int or float, default 10
            each node is drawn with size
            `len(node "membership" attribute) * node_size_multiple`

        Returns
        -------
        None : the figure is displayed with `plt.show()`
        """

        node_values = self.observatory.define_nodeValueDict(
            group_number, col, aggregation_func
        )
        target_nodes = sunset_dict(node_values, percentage=percentage, top=top)
        G = self.observatory.star.starGraph.components[group_number]
        nearest_target, nearest_target_distance = get_nearestTarget(
            G,
            target_nodes,
        )

        fig = plt.figure(figsize=(14, 8), dpi=500)
        ax = fig.add_subplot()

        # Multiply the count (not the list) so float multipliers work and no
        # throwaway repeated list is built.
        node_sizes = [
            len(attrs.get("membership", [])) * node_size_multiple
            for _, attrs in G.nodes(data=True)
        ]

        distance_values = list(nearest_target_distance.values())
        norm = plt.Normalize(min(distance_values), max(distance_values))

        if k is not None or seed is not None:
            new_layout = nx.spring_layout(
                G,
                k=k if k is not None else 0.12,
                seed=seed if seed is not None else 6,
            )
            # Merge into the stored layout instead of replacing it: G is a
            # single component, so replacing would discard the positions of
            # every other node and break a later whole-graph draw.
            stored = self.pos
            merged_layout = dict(stored) if isinstance(stored, dict) else {}
            merged_layout.update(new_layout)
            self.pos = merged_layout

        # networkx pairs node_color entries with nodes in G.nodes order, so
        # read the distances in that order whenever every node has an entry;
        # otherwise keep the dict's own ordering as before.
        node_order = list(G.nodes())
        if all(node in nearest_target_distance for node in node_order):
            ordered_distances = [nearest_target_distance[node] for node in node_order]
        else:
            ordered_distances = distance_values
        colors = plt.cm.tab20c(norm(ordered_distances))

        nx.draw(
            G,
            pos=self.pos,
            ax=ax,
            with_labels=node_labels,
            node_color=colors,
            edgecolors="white",
            node_size=node_sizes,
            font_size=8,
            font_color="white",
            width=0.5,
        )
        ## --> TODO dynamic node sizing
        target_sizes = 400
        # Redraw the sink nodes on top, larger and outlined, to highlight them
        nx.draw_networkx_nodes(
            G,
            pos=self.pos,
            ax=ax,
            nodelist=target_nodes,
            node_color="#3182BD",
            edgecolors="black",
            linewidths=1.5,
            node_size=target_sizes,
        )
        ## --> add "S" labels inside sink nodes to represent they are targets
        if not node_labels:
            for target in target_nodes:
                ax.text(
                    self.pos[target][0],
                    self.pos[target][1],
                    "S",
                    color="white",
                    fontsize=8,
                    ha="center",
                    va="center",
                    fontweight="bold",
                )

        if path_labels:
            ## --> add labels to the target nodes
            for target in target_nodes:
                ax.text(
                    self.pos[target][0],
                    self.pos[target][1],
                    "TARGET",
                    color="black",
                    fontsize=8,
                    ha="center",
                    va="center",
                )
            ## --> add labels to indicate which target is the closest to each non-target node
            for node, target in nearest_target.items():
                if target is not None and node != target:
                    ax.text(
                        self.pos[node][0],
                        self.pos[node][1],
                        f"Nearest: {target}",
                        color="black",
                        fontsize=8,
                        ha="center",
                        va="center",
                    )

        sm = plt.cm.ScalarMappable(cmap=plt.cm.tab20c, norm=norm)
        sm.set_array([])
        plt.colorbar(sm, label="Distance to Nearest Sink", shrink=0.75, ax=ax)
        plt.show()