# Source code for thema.multiverse.system.inner.moon

# File: multiverse/system/inner/moon.py
# Last Update: 05/15/24
# Updated By: JW

import pickle
import warnings

import category_encoders as ce
import pandas as pd
from sklearn.preprocessing import StandardScaler

from ....core import Core
from . import inner_utils


class Moon(Core):
    """
    The Moon: Modify, Omit, Oscillate and Normalize.
    ------------------------------------------------

    The Moon data class resides cosmically near to the original raw dataset.
    It bundles the individual preprocessing steps (imputation, column
    dropping, categorical encoding and scaling) needed to turn a raw
    dataframe into `imputeData` - a complete, numeric, scaled dataframe
    suitable for downstream analysis.

    Attributes
    ----------
    data : pd.DataFrame
        The raw data. Read by `fit()`; presumably populated by
        `Core.__init__` from the `data` constructor argument.
    imputeData : pd.DataFrame or None
        The complete, encoded, and scaled data. `None` until `fit()` runs.
    encoding : list or str
        Encoding method(s) for categorical (object dtype) columns. A single
        string is expanded by `fit()` to one entry per categorical column.
    scaler : str
        The scaling method used. Only "standard" is accepted by `fit()`.
    dropColumns : list
        Columns to drop from the raw data.
    imputeColumns : list
        Columns with missing values that should be imputed.
    imputeMethods : list
        Names of imputation functions in `inner_utils`, matched by position
        with `imputeColumns`.
    id : optional
        Identifier of this Moon object.
    seed : optional
        The random seed forwarded to the imputation functions.

    Methods
    -------
    fit()
        Performs the cleaning procedure according to the constructor
        arguments.
    save(file_path)
        Saves the current object using pickle serialization.

    Examples
    --------
    >>> data = pd.DataFrame({"A": ["Sally", "Freddy", "Johnny"],
    ...                      "B": ["cat", "dog", None],
    ...                      "C": [14, 22, 43]})
    >>> data.to_pickle("myRawData.pkl")
    >>> data_path = "myRawData.pkl"
    >>> moon = Moon(data=data_path,
    ...             dropColumns=["A"],
    ...             encoding=["one_hot"],
    ...             scaler="standard",
    ...             imputeColumns=["B"],
    ...             imputeMethods=["mode"])
    >>> moon.fit()
    >>> moon.imputeData.to_pickle("myCleanData.pkl")
    """

    def __init__(
        self,
        data,
        dropColumns=None,
        encoding="one_hot",
        scaler="standard",
        imputeColumns=None,
        imputeMethods=None,
        id=None,
        seed=None,
    ):
        """
        Constructor for Moon class.

        Initializes a Moon object and sets cleaning parameters. No cleaning
        is performed until `fit()` is called.

        Parameters
        ----------
        data : str or pd.DataFrame
            The path to the raw data file or a pandas dataframe of raw data.
            Forwarded to `Core.__init__` as `data_path`.
        dropColumns : list, optional
            Column names that will be dropped from the clean data.
            Defaults to an empty list.
        encoding : list or str, optional
            The encoding method(s) used for categorical variables
            ("one_hot", "integer" or "hash").
        scaler : str, optional
            The scaling method used.
        imputeColumns : list, optional
            Column names containing missing values. Defaults to an empty
            list.
        imputeMethods : list, optional
            Imputation methods used to fill missing values, one per entry
            of `imputeColumns`. Defaults to an empty list.
        id : None, optional
            The ID of the Moon object.
        seed : None, optional
            The random seed used.
        """
        super().__init__(data_path=data, clean_path=None, projection_path=None)
        # None sentinels avoid sharing one mutable default list between
        # every Moon instance.
        self.dropColumns = [] if dropColumns is None else dropColumns
        self.encoding = encoding
        self.scaler = scaler
        self.imputeColumns = [] if imputeColumns is None else imputeColumns
        self.imputeMethods = [] if imputeMethods is None else imputeMethods
        self.id = id
        self.seed = seed
        self.imputeData = None

    def fit(self):
        """
        Performs the cleaning procedure according to the constructor
        arguments.

        Sets the `imputeData` member to a DataFrame that is an imputed,
        encoded and scaled representation of the raw data. The steps run in
        this order: impute, drop columns, drop rows that still contain NaN,
        encode object-dtype columns, scale.

        Side effects: `dropColumns` is narrowed to the columns actually
        present in the raw data, and a string `encoding` is expanded to a
        list with one entry per categorical column.

        Raises
        ------
        AssertionError
            If the number of encodings does not match the number of
            categorical columns, or if `scaler` is not supported.

        Examples
        --------
        >>> moon = Moon(data="myRawData.pkl")
        >>> moon.fit()
        """
        # Imputation: start from the raw data plus imputation flags, then
        # overwrite each requested column with its imputed version.
        self.imputeData = inner_utils.add_imputed_flags(self.data, self.imputeColumns)
        for index, column in enumerate(self.imputeColumns):
            impute_function = getattr(inner_utils, self.imputeMethods[index])
            self.imputeData[column] = impute_function(self.data[column], self.seed)

        # Drop Columns (silently ignoring names absent from the raw data).
        self.dropColumns = [col for col in self.dropColumns if col in self.data.columns]
        if self.dropColumns:
            # Drop from the imputed frame, not from the raw data: rebuilding
            # from self.data here would throw away the imputation above.
            self.imputeData = self.imputeData.drop(columns=self.dropColumns)

        # Drop Rows with Nans. Not done in place so that a frame shared with
        # the raw data (depending on what add_imputed_flags returns) is
        # never mutated.
        self.imputeData = self.imputeData.dropna(axis=0)

        # Encoding
        categorical_columns = self.imputeData.select_dtypes(include=["object"]).columns
        if isinstance(self.encoding, str):
            self.encoding = [self.encoding for _ in categorical_columns]

        assert len(self.encoding) == len(categorical_columns), (
            f"length of encoding: {len(self.encoding)}, "
            f"length of cat variables: {len(categorical_columns)}"
        )

        for i, column in enumerate(categorical_columns):
            encoding = self.encoding[i]
            if self.imputeData[column].dtype != object:
                continue

            if encoding == "one_hot":
                self.imputeData = pd.get_dummies(
                    self.imputeData, prefix=f"OH_{column}", columns=[column]
                )
            elif encoding == "integer":
                vals = self.imputeData[column].values
                self.imputeData[column] = inner_utils.integer_encoder(vals)
            elif encoding == "hash":
                hashing_encoder = ce.HashingEncoder(cols=[column], n_components=10)
                self.imputeData = hashing_encoder.fit_transform(self.imputeData)
            else:
                # Unknown encodings are still skipped, but no longer silently.
                warnings.warn(
                    f"Unrecognized encoding '{encoding}' for column '{column}'; "
                    "column left unencoded."
                )

        # Scaling
        assert self.scaler in ["standard"], "Invalid Scaler"
        if self.scaler == "standard":
            scaler = StandardScaler()
            self.imputeData = pd.DataFrame(
                scaler.fit_transform(self.imputeData),
                columns=list(self.imputeData.columns),
            )

    def save(self, file_path):
        """
        Saves the current object using pickle serialization.

        Parameters
        ----------
        file_path : str
            The file path for the object to be written to.

        Examples
        --------
        >>> moon = Moon(data="myRawData.pkl")
        >>> moon.fit()
        >>> moon.save("myMoonObject.pkl")
        """
        with open(file_path, "wb") as f:
            pickle.dump(self, f)