Source code for thema.multiverse.system.inner.planet

# File: /multiverse/system/inner/planet.py
# Last Update: 05/15/24
# Updated By: JW

import os
import pickle
import random

import numpy as np
import pandas as pd
from omegaconf import ListConfig, OmegaConf

from ....core import Core
from ....utils import function_scheduler
from .inner_utils import clean_data_filename
from .moon import Moon


[docs] class Planet(Core): """ Perturb, Label And Navigate Existsing Tabulars --- Plan It. Planet! The Planet class lives in the --inner system-- and handles the transition from raw tabular data to scaled, encoded, and complete data. Specifically, this class is designed to handle datasets with missing values by filling missing values with randomly-sampled data, exploring the distribution of possible missing values. Parameters ---------- data : pd.Dataframe, optional A pandas dataframe of raw data. Default is None. outDir : str, optional The directory path where the processed data will be saved. Default is None. scaler : str, optional The method used for scaling the data. Default is "standard". encoding : str or list, optional The method used for encoding categorical variables. Default is "one_hot" for all categorical variables. **Accepted Values** - "one_hot" - "integer" - "hash" dropColumns : list, optional A list of columns to be dropped from the data. Default is None. imputeMethods : list, str, optional A dictionary mapping column names to the imputation method to be used for each column. Default is None. NOTE: this parameter can take multiple types **Behavior** - imputeMethods: list - Will iterate overall imputation methods contained in list and create datasets that have been imputed based on the selected methods. - imputeMethods: "sampleNormal" -> str - Will use the sampleNormal method (or other). - imputeMethods: None - Will default to dropping columns with missing values, not imputing (as the imputeMethod is None). **Accepted Values** - "sampleNormal" - "drop" - "mean" - "median" - "mode" imputeColumns : list, optional str "all" A list of columns to be imputed. Default is None. NOTE: this parameter can take multiple types **Behavior** - imputeColumns: list - Will only impute the selection of data columns passed in the list. - imputeColumns: "all" -> str - Will impute all columns with missing values per the specified imputeMethods. - NOTE: no other string values accepted. - imputeColumns: None - Will drop all columns with missing values (ignores parameter(s) specified in imputeMethods when this is the case). numSamples : int, optional The number of samples to generate. Default is 1. seeds : list, optional A list of random seeds to use for reproducibility. Default is [42]. verbose : bool, optional Whether to print progress messages. Default is False. YAML_PATH : str, optional The path to a YAML file containing configuration settings. Default is None. Attributes ---------- data : pd.Dataframe A pandas dataframe of raw data. encoding : str or list The method used for encoding categorical variables. scaler : str The method used for scaling the data. dropColumns : list A list of columns dropped from the raw data. imputeColumns : list A list of impute columns. imputeMethods : list The methodology used to impute columns. numSamples : int The number of clean data frames produced when imputing. seeds : list A list of random seeds. outDir : str The path to the out data directory. YAML_PATH : str The path to the YAML parameter file. Methods ------- get_data_path() -> str Returns the path to the raw data file. get_missingData_summary() -> dict Returns a dictionary summarizing missing data. get_recommended_sampling_method() -> list Returns the recommended sample method for a dataset. get_na_as_list() -> list Returns a list columns containing NaN values. getParams() -> dict Get a dictionary of parameters used in planet construction. writeParams_toYaml() -> None Saves your parameters to a YAML file. fit() Fits numSamples number of Moon objects and writes to outDir. save() Saves Planet to `.pkl` serialized object file. Example ------- >>> data = pd.DataFrame({"A": ["Sally", "Freddy", "Johnny"], "B": ["cat", "dog", None], "C": [14, 22, None]}) >>> data.to_pickle("myRawData") >>> data_path = "myRawData.pkl" >>> planet = Planet( data = data_path, outDir = "/<PATH TO OUT DIRECTORY>", scaler= "standard", encoding = "one_hot", dropColumns = None, imputeMethods = "sampleNormal", imputeColumns = "all", ) >>> planet.fit() >>> planet.imputeData.to_pickle("myCleanData") """ def __init__( self, data=None, outDir=None, scaler: str = "standard", encoding: str = "one_hot", dropColumns=None, imputeMethods=None, imputeColumns=None, numSamples: int = 1, seeds: list = [42], verbose: bool = False, YAML_PATH=None, ): """ Construct a Planet instance Parameters ---------- NOTE: all parameters can be provided via the YAML_PATH attr. data : str, optional Path to input data to be processed outDir : str, optional The directory path where the processed data will be saved. Default is None. scaler : str, optional The method used for scaling the data. Default is "standard". encoding : str or list The method used for encoding categorical variables. Default is "one_hot" for all categorical variables **Accepted Values** ```python "one_hot" "integer" "hash" ``` dropColumns : list, optional A list of columns to be dropped from the data. Default is None. imputeMethods : list, str, optional A dictionary mapping column names to the imputation method to be used for each column. Default is None. NOTE: this parameter can take multiple types **Behavior** imputeMethods: list - will iterate overall imputation methods contained in list and create datasets that have been imputed based on the selected methods imputeMethods: "sampleNormal" -> str - will use the sampleNormal method (or other) imputeMethods: None - will default to dropping columns with missing values, not imputing (as the imputeMethod is None) **Accepted Values** ```python "sampleNormal" "drop" "mean" "median" "mode" ``` imputeColumns : list, optional str "all" A list of columns to be imputed. Default is None. NOTE: this parameter can take multiple types **Behavior** imputeColumns: list - will only impute the selection of data columns passed in the list imputeColumns: "all" -> str - will impute all columns with missing values per the specified imputeMethods - NOTE: no other string values accepted imputeColumns: None - will drop all columns with missing values (ignores parameter(s) specified in imputeMethods when this is the case) numSamples : int, optional The number of samples to generate. Default is 1. seeds : list, optional A list of random seeds to use for reproducibility. Default is [42]. verbose : bool Whether to print progress messages. Default is False. YAML_PATH : str, optional The path to a YAML file containing configuration settings. Default is None. """ if YAML_PATH is None and data is None: raise ValueError( "Please provide config parameters or a path to a \ yaml configuration file." ) self.verbose = verbose self.YAML_PATH = None if YAML_PATH is not None: assert os.path.isfile( YAML_PATH ), f"yaml parameter file could not be found: {YAML_PATH}" self.YAML_PATH = YAML_PATH with open(YAML_PATH, "r") as f: params = OmegaConf.load(f) data = params.data scaler = params.Planet.scaler encoding = params.Planet.encoding dropColumns = params.Planet.dropColumns imputeColumns = params.Planet.imputeColumns imputeMethods = params.Planet.imputeMethods numSamples = params.Planet.numSamples seeds = params.Planet.seeds outDir = os.path.join(params.outDir, params.runName + "/clean") super().__init__(data_path=data, clean_path=None, projection_path=None) self.outDir = outDir if self.outDir is not None and not os.path.isdir(self.outDir): try: os.makedirs(outDir) except Exception as e: print(e) # HARD CODED SUPPORTED TYPED supported_imputeMethods = [ "sampleNormal", "sampleCategorical", "drop", "mean", "median", "mode", ] self.scaler = scaler self.encoding = encoding self.numSamples = numSamples if seeds == "auto": seeds = [random.randint(0, 100) for _ in range(numSamples)] self.seeds = seeds assert numSamples > 0 assert len(seeds) == numSamples assert self.scaler in ["standard"] if dropColumns is None or ( type(dropColumns) == str and dropColumns.lower() == "none" ): self.dropColumns = [] else: assert ( type(dropColumns) == list or type(dropColumns) == ListConfig ), "dropColumns must be a list" self.dropColumns = dropColumns if imputeColumns is None or imputeColumns == "None": self.imputeColumns = [] elif imputeColumns == "all": self.imputeColumns = self.data.columns[self.data.isna().any()].tolist() elif type(imputeColumns) == ListConfig or type(imputeColumns) == list: self.imputeColumns = imputeColumns for c in imputeColumns: if c not in self.data.columns: print("Invalid impute column. Defaulting to 'None'") self.imputeColumns = [] else: self.imputeColumns = [] if imputeMethods is None or imputeMethods == "None": self.imputeMethods = ["drop" for _ in range(len(self.imputeColumns))] elif type(imputeMethods) == str: if not imputeMethods in supported_imputeMethods: print("Invalid impute methods. Defaulting to 'drop'") imputeMethods = "drop" self.numSamples = 1 self.imputeMethods = [imputeMethods for _ in range(len(self.imputeColumns))] else: assert len(imputeMethods) == len( self.imputeColumns ), f"Lengh of imputeMethods: {len(imputeMethods)} must match length of imputeColumns: {len(self.imputeColumns)}" for index, method in enumerate(imputeMethods): if not method in supported_imputeMethods: print("Invalid impute methods. Defaulting to 'drop'") imputeMethods[index] = "drop" self.imputeMethods = imputeMethods def _repr_html_(self): """ Generate HTML representation of the Planet Class """ html = """ <style> .planet-table { display: none; /* Hide the table by default */ font-family: Arial, sans-serif; border-collapse: collapse; width: 100%; max-width: 600px; /* Set maximum width for the table */ } .planet-table th { background-color: #f2f2f2; border: 1px solid #dddddd; text-align: left; padding: 8px; } .planet-table td { border: 1px solid #dddddd; text-align: left; padding: 8px; } .planet-name { font-weight: bold; color: #333; } .planet-emoticon { font-size: 24px; cursor: pointer; /* Add cursor pointer for clickable effect */ user-select: none; /* Disable text selection for the icon */ } .planet-emoticon:hover { color: #007bff; /* Change color on hover for visual feedback */ } </style> """ # Generate a unique ID for each instance of the Planet class planet_id = id(self) html += f"<h2><span class='planet-emoticon' onclick=\"toggleTable('planet-table-{planet_id}')\">🪐</span> thema.multiverse.Planet</h2>" html += f"<table class='planet-table' id='planet-table-{planet_id}'>" for attr, value in self.getParams().items(): html += "<tr><td class='planet-name'>{}</td><td>{}</td></tr>".format( attr, value ) html += "</table>" # Add JavaScript to toggle table visibility html += """ <script> function toggleTable(id) { var table = document.getElementById(id); if (table.style.display === 'none') { table.style.display = 'table'; } else { table.style.display = 'none'; } } </script> """ return html
[docs] def get_missingData_summary(self) -> dict: """ Get a summary of missing data in the columns of the 'data' dataframe. Returns ------- summary : dict A dictionary containing a breakdown of columns from 'data' that are: - 'numericMissing': Numeric columns with missing values - 'numericComplete': Numeric columns without missing values - 'categoricalMissing': Categorical columns with missing values - 'categoricalComplete': Categorical columns without missing values Examples -------- >>> data = pd.DataFrame({"A": [1, 2, None], "B": [3, None, 5], "C": ["a", "b", None]}) >>> planet = Planet(data=data) >>> summary = planet.get_missingData_summary() >>> print(summary) {'numericMissing': ['A', 'B'], 'numericComplete': [], 'categoricalMissing': ['C'], 'categoricalComplete': ['A', 'B']} """ numeric_missing = [] numeric_not_missing = [] categorical_missing = [] categorical_complete = [] for column in self.data.columns: if self.data[column].dtype.kind in "biufc": if self.data[column].isna().any(): numeric_missing.append(column) else: numeric_not_missing.append(column) else: if self.data[column].isna().any(): categorical_missing.append(column) else: categorical_complete.append(column) summary = { "numericMissing": numeric_missing, "numericComplete": numeric_not_missing, "categoricalMissing": categorical_missing, "categoricalComplete": categorical_complete, } return summary
[docs] def get_na_as_list(self) -> list: """ Get a list of columns that contain NaN values. Returns ------- list of str A list of column names that contain NaN values. Examples -------- >>> data = pd.DataFrame({"A": [1, 2, None], "B": [3, None, 5], "C": ["a", "b", None]}) >>> planet = Planet(data=data) >>> na_columns = planet.get_na_as_list() >>> print(na_columns) ['A', 'B', 'C'] """ return self.data.columns[self.data.isna().any()].tolist()
[docs] def get_recomended_sampling_method(self) -> list: """ Get a recommended sampling method for columns with missing values. Returns ------- list A list of recommended sampling methods for columns with missing values. For numeric columns, "sampleNormal" is recommended. For non-numeric columns, "sampleCategorical" (most frequent value) is recommended. Examples -------- >>> data = pd.DataFrame({"A": [1, 2, None], "B": [3, None, 5], "C": ["a", "b", None]}) >>> planet = Planet(data=data) >>> methods = planet.get_recommended_sampling_method() >>> print(methods) ['sampleNormal', 'sampleCategorical', 'sampleCategorical'] """ methods = [] for column in self.data.columns[self.data.isna().any()].tolist(): if pd.api.types.is_numeric_dtype(self.data[column]): methods.append("sampleNormal") else: methods.append("mode") return methods
[docs] def fit(self): """ The meat and potatoes -- configure and run your planet object based on the specified params. Uses the `ProcessPoolExecutor` library to spawn multiple processes and generate results in a time-efficient manner. Returns ------- None Saves numSamples of files (cleaned, imputed, scaled etc. data) to the specified outDir. Examples -------- >>> data = pd.DataFrame({"A": ["Sally", "Freddy", "Johnny"], "B": ["cat", "dog", None], "C": [14, 22, None]}) >>> data.to_pickle("myRawData") >>> data_path = "myRawData.pkl" >>> planet = Planet( data = data_path, outDir = "<PATH TO OUT DIRECTORY>", scaler= "standard", encoding = "one_hot", dropColumns = None, imputeMethods = "sampleNormal", imputeColumns = "all", ) >>> planet.fit() >>> planet.imputeData.to_pickle("myCleanData") """ assert len(self.seeds) == self.numSamples subprocesses = [] for i in range(self.numSamples): cmd = (self._instantiate_moon, i) subprocesses.append(cmd) function_scheduler( subprocesses, max_workers=min(4, self.numSamples), out_message="SUCCESS: Imputation(s)", resilient=True, verbose=self.verbose, )
def _instantiate_moon(self, id): """ Helper function for the fit() method. See `fit()` for more details. Parameters ---------- id : int Identifier for the Moon instance. Returns ------- None Examples -------- >>> planet = Planet() >>> planet._instantiate_moon(1) """ if self.seeds is None: self.seeds = dict() self.seeds[id] = np.random.randint(0, 1000) my_moon = Moon( data=self.get_data_path(), dropColumns=self.dropColumns, encoding=self.encoding, scaler=self.scaler, imputeColumns=self.imputeColumns, imputeMethods=self.imputeMethods, seed=self.seeds[id], id=id, ) my_moon.fit() filename_without_extension, extension = os.path.splitext(self.get_data_path()) data_name = filename_without_extension.split("/")[-1] file_name = clean_data_filename( data_name=data_name, id=id, scaler=self.scaler, encoding=self.encoding, ) output_filepath = os.path.join(self.outDir, file_name) my_moon.save(file_path=output_filepath)
[docs] def getParams(self) -> dict: """ Get the parameters used to initialize the space of Moons around this Planet. Returns ------- dict A dictionary containing the parameters used to initialize this specific Planet instance. Examples -------- >>> planet = Planet() >>> params = planet.getParams() >>> print(params) {'data': None, 'scaler': 'standard', 'encoding': 'one_hot', 'dropColumns': None, 'imputeColumns': None, 'imputeMethods': None, 'numSamples': 1, 'seeds': [42], 'outDir': None} """ return { "data": self.get_data_path(), "scaler": self.scaler, "encoding": self.encoding, "dropColumns": self.dropColumns, "imputeColumns": self.imputeColumns, "imputeMethods": self.imputeMethods, "numSamples": self.numSamples, "seeds": self.seeds, "outDir": self.outDir, }
[docs] def writeParams_toYaml(self, YAML_PATH=None): """ Write the specified parameters to a YAML file. Parameters ---------- YAML_PATH : str The path to an existing YAML file. Returns ------- None Examples -------- >>> planet = Planet() >>> planet.writeParams_toYaml("config.yaml") YAML file successfully updated """ if YAML_PATH is None and self.YAML_PATH is not None: YAML_PATH = self.YAML_PATH if YAML_PATH is None and self.YAML_PATH is None: raise ValueError("Please provide a valid filepath to YAML") # Check if file exists and is correct type if not os.path.isfile(YAML_PATH): raise TypeError("File path does not point to a YAML file") with open(YAML_PATH, "r") as f: params = OmegaConf.load(f) params.Planet = self.getParams() params.Planet.pop("outDir", None) params.Planet.pop("data", None) with open(YAML_PATH, "w") as f: OmegaConf.save(params, f) print("YAML file successfully updated")
[docs] def save(self, file_path): """ Save the current object instance to a file using pickle serialization. Parameters ---------- file_path : str The path to the file where the object will be saved. Examples -------- >>> planet = Planet() >>> planet.save("myPlanet.pkl") """ try: with open(file_path, "wb") as f: pickle.dump(self, f) except Exception as e: print(e)