Source code for thema.multiverse.system.inner.planet

# File: /multiverse/system/inner/planet.py
# Last Update: 05/15/24
# Updated By: JW

import os
import pickle
import random
import logging
import time

import numpy as np
import pandas as pd
from omegaconf import ListConfig, OmegaConf

from ....core import Core
from ....utils import (
    function_scheduler,
    get_current_logging_config,
    configure_child_process_logging,
)
from .inner_utils import clean_data_filename
from .moon import Moon


logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
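# This module logs through the standard `logging` hierarchy and attaches only
# a NullHandler, so it stays silent unless the consuming application opts in.
# A minimal, illustrative way to surface Planet's debug output (stdlib only):
#
#     import logging
#     logging.basicConfig(level=logging.DEBUG)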


class Planet(Core):
    """
    Perturb, Label And Navigate Existing Tabulars --- Plan It. Planet!

    The Planet class lives in the --inner system-- and handles the transition
    from raw tabular data to scaled, encoded, and complete data. Specifically,
    this class is designed to handle datasets with missing values by filling
    missing values with randomly-sampled data, exploring the distribution of
    possible missing values.

    Parameters
    ----------
    data : str or pd.DataFrame, optional
        Raw data, given as a path to a serialized file or as a pandas
        dataframe. Default is None.
    outDir : str, optional
        The directory path where the processed data will be saved.
        Default is None.
    scaler : str, optional
        The method used for scaling the data. Default is "standard".
    encoding : str or list, optional
        The method used for encoding categorical variables. Default is
        "one_hot" for all categorical variables.

        **Accepted Values**

        - "one_hot"
        - "integer"
        - "hash"
    dropColumns : list, optional
        A list of columns to be dropped from the data. Default is None.
    imputeMethods : list or str, optional
        The imputation method(s) to apply to the columns in imputeColumns.
        Default is None.
        NOTE: this parameter can take multiple types.

        **Behavior**

        - imputeMethods: list
            Will iterate over all imputation methods contained in the list
            and create datasets that have been imputed based on the selected
            methods.
        - imputeMethods: "sampleNormal" -> str
            Will use the sampleNormal method (or other single method) for
            every impute column.
        - imputeMethods: None
            Will default to dropping columns with missing values, not
            imputing (as the imputeMethod is None).

        **Accepted Values**

        - "sampleNormal"
        - "sampleCategorical"
        - "drop"
        - "mean"
        - "median"
        - "mode"
    imputeColumns : list or str, optional
        A list of columns to be imputed. Default is None.
        NOTE: this parameter can take multiple types.

        **Behavior**

        - imputeColumns: list
            Will only impute the selection of data columns passed in the
            list.
        - imputeColumns: "auto" -> str
            Will impute all columns with missing values per the specified
            imputeMethods. NOTE: no other string values accepted.
        - imputeColumns: None
            Will drop all columns with missing values (ignores parameter(s)
            specified in imputeMethods when this is the case).
    numSamples : int, optional
        The number of samples to generate. Default is 1.
    seeds : list or str, optional
        A list of random seeds to use for reproducibility, or "auto" to draw
        one random seed per sample. Default is [42].
    verbose : bool, optional
        Whether to print progress messages. Default is False.
    YAML_PATH : str, optional
        The path to a YAML file containing configuration settings.
        Default is None.

    Attributes
    ----------
    data : pd.DataFrame
        A pandas dataframe of raw data.
    encoding : str or list
        The method used for encoding categorical variables.
    scaler : str
        The method used for scaling the data.
    dropColumns : list
        A list of columns dropped from the raw data.
    imputeColumns : list
        A list of columns to impute.
    imputeMethods : list
        The methodology used to impute columns.
    numSamples : int
        The number of clean data frames produced when imputing.
    seeds : list
        A list of random seeds.
    outDir : str
        The path to the out data directory.
    YAML_PATH : str
        The path to the YAML parameter file.

    Methods
    -------
    get_data_path() -> str
        Returns the path to the raw data file.
    get_missingData_summary() -> dict
        Returns a dictionary summarizing missing data.
    get_recomended_sampling_method() -> list
        Returns the recommended sampling method for each column with
        missing values.
    get_na_as_list() -> list
        Returns a list of columns containing NaN values.
    getParams() -> dict
        Get a dictionary of parameters used in planet construction.
    writeParams_toYaml() -> None
        Saves your parameters to a YAML file.
    fit()
        Fits numSamples number of Moon objects and writes to outDir.
    save()
        Saves Planet to `.pkl` serialized object file.

    Example
    -------
    >>> data = pd.DataFrame({"A": ["Sally", "Freddy", "Johnny"], "B": ["cat", "dog", None], "C": [14, 22, None]})
    >>> data.to_pickle("myRawData.pkl")
    >>> data_path = "myRawData.pkl"
    >>> planet = Planet(
    ...     data=data_path,
    ...     outDir="/<PATH TO OUT DIRECTORY>",
    ...     scaler="standard",
    ...     encoding="one_hot",
    ...     dropColumns=None,
    ...     imputeMethods="sampleNormal",
    ...     imputeColumns="auto",
    ... )
    >>> planet.fit()
    >>> planet.imputeData.to_pickle("myCleanData.pkl")
    """

    def __init__(
        self,
        data=None,
        outDir=None,
        scaler: str = "standard",
        encoding: str = "one_hot",
        dropColumns=None,
        imputeMethods=None,
        imputeColumns=None,
        numSamples: int = 1,
        seeds: list = [42],
        verbose: bool = False,
        YAML_PATH=None,
    ):
        """
        Construct a Planet instance.

        Parameters
        ----------
        NOTE: all parameters can be provided via the YAML_PATH attr.

        data : str, optional
            Path to input data to be processed.
        outDir : str, optional
            The directory path where the processed data will be saved.
            Default is None.
        scaler : str, optional
            The method used for scaling the data. Default is "standard".
        encoding : str or list, optional
            The method used for encoding categorical variables. Default is
            "one_hot" for all categorical variables.

            **Accepted Values**
            ```python
            "one_hot"
            "integer"
            "hash"
            ```
        dropColumns : list, optional
            A list of columns to be dropped from the data. Default is None.
        imputeMethods : list or str, optional
            The imputation method(s) to apply to the columns in
            imputeColumns. Default is None.
            NOTE: this parameter can take multiple types.

            **Behavior**

            imputeMethods: list
                - will iterate over all imputation methods contained in the
                  list and create datasets that have been imputed based on
                  the selected methods
            imputeMethods: "sampleNormal" -> str
                - will use the sampleNormal method (or other single method)
            imputeMethods: None
                - will default to dropping columns with missing values, not
                  imputing (as the imputeMethod is None)

            **Accepted Values**
            ```python
            "sampleNormal"
            "sampleCategorical"
            "drop"
            "mean"
            "median"
            "mode"
            ```
        imputeColumns : list or str, optional
            A list of columns to be imputed. Default is None.
            NOTE: this parameter can take multiple types.

            **Behavior**

            imputeColumns: list
                - will only impute the selection of data columns passed in
                  the list
            imputeColumns: "auto" -> str
                - will impute all columns with missing values per the
                  specified imputeMethods
                - NOTE: no other string values accepted
            imputeColumns: None
                - will drop all columns with missing values (ignores
                  parameter(s) specified in imputeMethods when this is
                  the case)
        numSamples : int, optional
            The number of samples to generate. Default is 1.
        seeds : list or str, optional
            A list of random seeds to use for reproducibility, or "auto" to
            draw one random seed per sample. Default is [42].
        verbose : bool, optional
            Whether to print progress messages. Default is False.
        YAML_PATH : str, optional
            The path to a YAML file containing configuration settings.
            Default is None.
        """
        if YAML_PATH is None and data is None:
            raise ValueError(
                "Please provide config parameters or a path to a "
                "yaml configuration file."
            )

        self.verbose = verbose
        self.YAML_PATH = None
        if YAML_PATH is not None:
            assert os.path.isfile(YAML_PATH), "params.yaml file not found!"
            self.YAML_PATH = YAML_PATH
            with open(YAML_PATH, "r") as f:
                params = OmegaConf.load(f)
            data = params.data
            scaler = params.Planet.scaler
            encoding = params.Planet.encoding
            dropColumns = params.Planet.dropColumns
            imputeColumns = params.Planet.imputeColumns
            imputeMethods = params.Planet.imputeMethods
            numSamples = params.Planet.numSamples
            seeds = params.Planet.seeds
            outDir = os.path.join(params.outDir, params.runName + "/clean")

        super().__init__(data_path=data, clean_path=None, projection_path=None)
        self.outDir = outDir

        if self.outDir is not None and not os.path.isdir(self.outDir):
            try:
                os.makedirs(str(outDir))
            except Exception as e:
                print(e)

        # HARD CODED SUPPORTED TYPES
        supported_imputeMethods = [
            "sampleNormal",
            "sampleCategorical",
            "drop",
            "mean",
            "median",
            "mode",
        ]

        self.scaler = scaler
        self.encoding = encoding
        self.numSamples = numSamples
        if seeds == "auto":
            seeds = [random.randint(0, 100) for _ in range(numSamples)]
        self.seeds = seeds

        assert numSamples > 0
        assert len(seeds) == numSamples
        assert self.scaler in ["standard"]

        if dropColumns is None or (
            type(dropColumns) == str and dropColumns.lower() == "none"
        ):
            self.dropColumns = []
        else:
            assert (
                type(dropColumns) == list or type(dropColumns) == ListConfig
            ), "dropColumns must be a list"
            self.dropColumns = dropColumns

        if imputeColumns is None or imputeColumns == "None":
            self.imputeColumns = []
        elif imputeColumns == "auto":
            self.imputeColumns = self.data.columns[self.data.isna().any()].tolist()
        elif type(imputeColumns) == ListConfig or type(imputeColumns) == list:
            self.imputeColumns = imputeColumns
            for c in imputeColumns:
                if c not in self.data.columns:
                    print("Invalid impute column. Defaulting to 'None'")
                    self.imputeColumns = []
        else:
            self.imputeColumns = []

        if imputeMethods is None or imputeMethods == "None":
            self.imputeMethods = ["drop" for _ in range(len(self.imputeColumns))]
        elif imputeMethods == "auto":
            self.imputeMethods = self.get_recomended_sampling_method()
        elif type(imputeMethods) == str:
            if imputeMethods not in supported_imputeMethods:
                print("Invalid impute methods. Defaulting to 'drop'")
                imputeMethods = "drop"
            self.imputeMethods = [
                imputeMethods for _ in range(len(self.imputeColumns))
            ]
        else:
            assert len(imputeMethods) == len(
                self.imputeColumns
            ), f"Length of imputeMethods: {len(imputeMethods)} must match length of imputeColumns: {len(self.imputeColumns)}"
            for index, method in enumerate(imputeMethods):
                if method not in supported_imputeMethods:
                    print("Invalid impute methods. Defaulting to 'drop'")
                    imputeMethods[index] = "drop"
            self.imputeMethods = imputeMethods

        # Check whether any imputeMethods are randomized; deterministic
        # methods would produce numSamples identical copies.
        has_randomized_imputation = any(
            method in ["sampleNormal", "sampleCategorical"]
            for method in self.imputeMethods
        )
        if not has_randomized_imputation and self.numSamples > 1:
            print(
                "Warning: No randomized imputation methods specified. "
                "Overwriting to single sample."
            )
            self.numSamples = 1
            self.seeds = [42]

        # Log Planet configuration
        logger.debug(f"Planet initialized with data shape: {self.data.shape}")
        logger.debug(f"Number of samples to generate: {self.numSamples}")
        logger.debug(f"Impute columns: {self.imputeColumns}")
        logger.debug(f"Impute methods: {self.imputeMethods}")
        logger.debug(f"Drop columns: {self.dropColumns}")
        logger.debug(f"Encoding: {self.encoding}, Scaler: {self.scaler}")
        logger.debug(f"Output directory: {self.outDir}")
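    # Illustrative sketch (not an excerpt from the original source): the list
    # forms of imputeColumns/imputeMethods pair one method with each column,
    # and seeds="auto" draws one random seed per sample. The path and column
    # names below are hypothetical placeholders.
    #
    #     planet = Planet(
    #         data="myRawData.pkl",
    #         outDir="out/clean",
    #         imputeColumns=["B", "C"],
    #         imputeMethods=["mode", "sampleNormal"],
    #         numSamples=3,
    #         seeds="auto",
    #     )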
    def get_missingData_summary(self) -> dict:
        """
        Get a summary of missing data in the columns of the 'data' dataframe.

        Returns
        -------
        summary : dict
            A dictionary containing a breakdown of columns from 'data' that are:
            - 'numericMissing': Numeric columns with missing values
            - 'numericComplete': Numeric columns without missing values
            - 'categoricalMissing': Categorical columns with missing values
            - 'categoricalComplete': Categorical columns without missing values

        Examples
        --------
        >>> data = pd.DataFrame({"A": [1, 2, None], "B": [3, None, 5], "C": ["a", "b", None]})
        >>> planet = Planet(data=data)
        >>> summary = planet.get_missingData_summary()
        >>> print(summary)
        {'numericMissing': ['A', 'B'], 'numericComplete': [], 'categoricalMissing': ['C'], 'categoricalComplete': []}
        """
        numeric_missing = []
        numeric_not_missing = []
        categorical_missing = []
        categorical_complete = []
        for column in self.data.columns:
            # Dtype kinds b, i, u, f, c cover boolean, integer, unsigned,
            # float, and complex columns.
            if self.data[column].dtype.kind in "biufc":
                if self.data[column].isna().any():
                    numeric_missing.append(column)
                else:
                    numeric_not_missing.append(column)
            else:
                if self.data[column].isna().any():
                    categorical_missing.append(column)
                else:
                    categorical_complete.append(column)

        summary = {
            "numericMissing": numeric_missing,
            "numericComplete": numeric_not_missing,
            "categoricalMissing": categorical_missing,
            "categoricalComplete": categorical_complete,
        }
        return summary
    def get_na_as_list(self) -> list:
        """
        Get a list of columns that contain NaN values.

        Returns
        -------
        list of str
            A list of column names that contain NaN values.

        Examples
        --------
        >>> data = pd.DataFrame({"A": [1, 2, None], "B": [3, None, 5], "C": ["a", "b", None]})
        >>> planet = Planet(data=data)
        >>> na_columns = planet.get_na_as_list()
        >>> print(na_columns)
        ['A', 'B', 'C']
        """
        return self.data.columns[self.data.isna().any()].tolist()
    def get_recomended_sampling_method(self) -> list:
        """
        Get a recommended sampling method for columns with missing values.

        Returns
        -------
        list
            A list of recommended sampling methods for columns with missing
            values. For numeric columns, "sampleNormal" is recommended. For
            non-numeric columns, "sampleCategorical" (most frequent value)
            is recommended.

        Examples
        --------
        >>> data = pd.DataFrame({"A": [1, 2, None], "B": [3, None, 5], "C": ["a", "b", None]})
        >>> planet = Planet(data=data)
        >>> methods = planet.get_recomended_sampling_method()
        >>> print(methods)
        ['sampleNormal', 'sampleNormal', 'sampleCategorical']
        """
        methods = []
        for column in self.data.columns[self.data.isna().any()].tolist():
            if pd.api.types.is_numeric_dtype(self.data[column]):
                methods.append("sampleNormal")
            else:
                methods.append("sampleCategorical")
        return methods
    def fit(self):
        """
        The meat and potatoes -- configure and run your planet object based
        on the specified params. Uses a `ProcessPoolExecutor` (via
        `function_scheduler`) to spawn multiple processes and generate
        results in a time-efficient manner.

        Returns
        -------
        None
            Saves numSamples of files (cleaned, imputed, scaled etc. data)
            to the specified outDir.

        Examples
        --------
        >>> data = pd.DataFrame({"A": ["Sally", "Freddy", "Johnny"], "B": ["cat", "dog", None], "C": [14, 22, None]})
        >>> data.to_pickle("myRawData.pkl")
        >>> data_path = "myRawData.pkl"
        >>> planet = Planet(
        ...     data=data_path,
        ...     outDir="<PATH TO OUT DIRECTORY>",
        ...     scaler="standard",
        ...     encoding="one_hot",
        ...     dropColumns=None,
        ...     imputeMethods="sampleNormal",
        ...     imputeColumns="auto",
        ... )
        >>> planet.fit()
        >>> planet.imputeData.to_pickle("myCleanData.pkl")
        """
        logger.info(
            f"Starting Planet.fit() – creating {self.numSamples} Moon object(s)"
        )
        logger.debug(f"Using seeds: {self.seeds}")
        logger.debug(f"Output directory: {self.outDir}")

        # Get current logging config to pass to child processes
        logging_config = get_current_logging_config()

        assert len(self.seeds) == self.numSamples
        subprocesses = []
        for i in range(self.numSamples):
            logger.debug(
                f"Preparing Moon {i+1}/{self.numSamples} with seed {self.seeds[i]}"
            )
            cmd = (self._instantiate_moon, i, logging_config)
            subprocesses.append(cmd)

        # Pre-count outputs for delta reporting
        pre_count = 0
        try:
            if self.outDir and os.path.isdir(self.outDir):
                pre_count = len(
                    [f for f in os.listdir(self.outDir) if f.endswith(".pkl")]
                )
        except Exception:
            pass

        workers = min(4, self.numSamples)
        logger.info(
            f"Launching {len(subprocesses)} Moon process(es) with max {workers} worker(s)…"
        )
        t0 = time.perf_counter()
        results = function_scheduler(
            subprocesses,
            max_workers=workers,
            out_message="SUCCESS: Imputation(s)",
            resilient=True,
            verbose=self.verbose,
        )
        t1 = time.perf_counter()

        # Post-count outputs for delta reporting
        created = None
        try:
            if self.outDir and os.path.isdir(self.outDir):
                post_count = len(
                    [f for f in os.listdir(self.outDir) if f.endswith(".pkl")]
                )
                created = max(0, post_count - pre_count)
        except Exception:
            created = None

        total = len(subprocesses)
        duration = t1 - t0
        if isinstance(results, list) and len(results) == total:
            logger.info(
                f"Planet.fit() complete in {duration:.2f}s – processed {total} Moon object(s)"
                f"{'' if created is None else f', created ~{created} file(s)'}"
            )
        else:
            logger.info(
                f"Planet.fit() complete in {duration:.2f}s – processed {total} Moon object(s)."
            )
    def _instantiate_moon(self, id, logging_config):
        """
        Helper function for the fit() method. See `fit()` for more details.

        This method creates and processes a single Moon instance with proper
        logging configuration for multiprocessing environments.

        Parameters
        ----------
        id : int
            Identifier for the Moon instance.
        logging_config : dict or None
            Logging configuration from parent process.

        Returns
        -------
        None

        Examples
        --------
        >>> planet = Planet(data="myRawData.pkl", outDir="<PATH TO OUT DIRECTORY>")
        >>> planet._instantiate_moon(0, logging_config=None)
        """
        # Configure logging in this child process
        configure_child_process_logging(logging_config)

        if self.seeds is None:
            self.seeds = dict()
            self.seeds[id] = np.random.randint(0, 1000)

        my_moon = Moon(
            data=self.get_data_path(),
            dropColumns=self.dropColumns,
            encoding=self.encoding,
            scaler=self.scaler,
            imputeColumns=self.imputeColumns,
            imputeMethods=self.imputeMethods,
            seed=self.seeds[id],
            id=id,
        )
        my_moon.fit()

        # Build the output filename from the raw data name and the
        # preprocessing parameters.
        filename_without_extension, extension = os.path.splitext(self.get_data_path())
        data_name = filename_without_extension.split("/")[-1]
        file_name = clean_data_filename(
            data_name=data_name,
            id=id,
            scaler=self.scaler,
            encoding=self.encoding,
        )
        output_dir = str(self.outDir)
        output_filepath = os.path.join(output_dir, file_name)
        my_moon.save(file_path=output_filepath)
    def getParams(self) -> dict:
        """
        Get the parameters used to initialize the space of Moons around
        this Planet.

        Returns
        -------
        dict
            A dictionary containing the parameters used to initialize this
            specific Planet instance.

        Examples
        --------
        >>> planet = Planet(data="myRawData.pkl", outDir="out")
        >>> params = planet.getParams()
        >>> print(params)
        {'data': 'myRawData.pkl', 'scaler': 'standard', 'encoding': 'one_hot', 'dropColumns': [], 'imputeColumns': [], 'imputeMethods': [], 'numSamples': 1, 'seeds': [42], 'outDir': 'out'}
        """
        return {
            "data": self.get_data_path(),
            "scaler": self.scaler,
            "encoding": self.encoding,
            "dropColumns": self.dropColumns,
            "imputeColumns": self.imputeColumns,
            "imputeMethods": self.imputeMethods,
            "numSamples": self.numSamples,
            "seeds": self.seeds,
            "outDir": self.outDir,
        }
    def writeParams_toYaml(self, YAML_PATH=None):
        """
        Write or create a YAML file with the Planet parameters.

        Parameters
        ----------
        YAML_PATH : str, optional
            Path to an existing or new YAML file.

        Returns
        -------
        None
        """
        if YAML_PATH is None and self.YAML_PATH is not None:
            YAML_PATH = self.YAML_PATH
        if YAML_PATH is None:
            raise ValueError("Please provide a valid filepath to YAML")

        # If the file exists, load it; otherwise start a new config
        if os.path.isfile(YAML_PATH):
            params = OmegaConf.load(YAML_PATH)
        else:
            params = OmegaConf.create()

        # Update with this object's parameters; data and outDir are
        # run-specific, so they are not written to the shared config.
        params.Planet = self.getParams()
        params.Planet.pop("outDir", None)
        params.Planet.pop("data", None)

        # Ensure the target directory exists (skip for bare filenames,
        # where dirname is empty and os.makedirs would fail)
        dir_name = os.path.dirname(YAML_PATH)
        if dir_name:
            os.makedirs(dir_name, exist_ok=True)

        # Save the YAML file
        file_exists_before = os.path.isfile(YAML_PATH)
        OmegaConf.save(params, YAML_PATH)
        print(
            f"YAML file successfully {'updated' if file_exists_before else 'created'} at {YAML_PATH}"
        )
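    # For illustration (layout inferred from getParams(), not an excerpt from
    # a real run): the saved YAML nests everything under a `Planet` key with
    # `data` and `outDir` removed, e.g.
    #
    #     Planet:
    #       scaler: standard
    #       encoding: one_hot
    #       dropColumns: []
    #       imputeColumns: []
    #       imputeMethods: []
    #       numSamples: 1
    #       seeds:
    #       - 42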
    def save(self, file_path):
        """
        Save the current object instance to a file using pickle serialization.

        Parameters
        ----------
        file_path : str
            The path to the file where the object will be saved.

        Examples
        --------
        >>> planet = Planet(data="myRawData.pkl", outDir="out")
        >>> planet.save("myPlanet.pkl")
        """
        try:
            with open(file_path, "wb") as f:
                pickle.dump(self, f)
        except Exception as e:
            print(e)
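# A Planet saved with `save()` can be restored with the standard pickle API:
#
#     with open("myPlanet.pkl", "rb") as f:
#         planet = pickle.load(f)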