In [1]:
import pandas as pd
import numpy as np


In [2]:
# Create a sample DataFrame that contains a missing title (NaN) and an
# outlier salary that will need to be capped.
data = {
    "empid": [1, 2, 3],
    "empName": ['Jone', 'Lisa', 'Michele'],
    'salary': [1000, 2000, 1000000],
    # np.nan is the canonical spelling; the np.NAN alias was removed in NumPy 2.0.
    'title': ['intern', 'clerk', np.nan],
}
df = pd.DataFrame(data)
df

Unnamed: 0,empid,empName,salary,title
0,1,Jone,1000,intern
1,2,Lisa,2000,clerk
2,3,Michele,1000000,


In [3]:
# Replace the missing job title with an explicit 'Missing' label.
df['title'] = df['title'].fillna('Missing')
df

Unnamed: 0,empid,empName,salary,title
0,1,Jone,1000,intern
1,2,Lisa,2000,clerk
2,3,Michele,1000000,Missing


In [4]:
# Cap the outlier: every salary above 5000 is replaced by 5000.
salary_new = np.where(df['salary'] > 5000, 5000, df['salary'])
df['salary'] = salary_new
df

Unnamed: 0,empid,empName,salary,title
0,1,Jone,1000,intern
1,2,Lisa,2000,clerk
2,3,Michele,5000,Missing


In [5]:
from numpy.random import randint
from sklearn.base import BaseEstimator, TransformerMixin
class model_transformer (BaseEstimator, TransformerMixin):
    """Stateless cleaning step: label missing titles and cap large salaries.

    The transformer learns nothing from the data; ``transform`` applies two
    fixed rules to a DataFrame that has ``title`` and ``salary`` columns.
    """

    def fit(self, X, y=None):
        """Return ``self`` unchanged; there is nothing to learn.

        Parameters
        ----------
        X : pandas.DataFrame
            Ignored.
        y : ignored
            Present only for scikit-learn API compatibility.
        """
        return self  # nothing else to do

    def transform(self, X):
        """Return a cleaned copy of ``X``.

        * Missing values in ``title`` are replaced by the string 'Missing'.
        * Values in ``salary`` greater than 5000 are replaced by 5000.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame with ``title`` and ``salary`` columns; other columns are
            passed through untouched.

        Returns
        -------
        pandas.DataFrame
            A new frame; the caller's ``X`` is not modified.
        """
        # Work on a copy so the caller's DataFrame is never mutated as a side effect.
        X_out = X.copy()
        X_out['title'] = X_out['title'].fillna(value='Missing')   # convert missing
        # Cap the outliers (this rule could later be factored into its own function).
        X_out['salary'] = np.where(X_out['salary'] > 5000, 5000, X_out['salary'])
        return X_out
        
        

In [6]:
# Reset df to the original raw values (missing title, uncapped salary).
data = {
    "empid": [1, 2, 3],
    "empName": ['Jone', 'Lisa', 'Michele'],
    'salary': [1000, 2000, 1000000],
    # np.nan is the canonical spelling; the np.NAN alias was removed in NumPy 2.0.
    'title': ['intern', 'clerk', np.nan],
}
df = pd.DataFrame(data)
df

Unnamed: 0,empid,empName,salary,title
0,1,Jone,1000,intern
1,2,Lisa,2000,clerk
2,3,Michele,1000000,


In [7]:
# Create the transformer, then fit (a no-op for this class) and transform df in one call.
mt = model_transformer()
df1 = mt.fit_transform(df)
df1

Unnamed: 0,empid,empName,salary,title
0,1,Jone,1000,intern
1,2,Lisa,2000,clerk
2,3,Michele,5000,Missing


In [8]:
# Create a different dataset with two missing titles and two outlier salaries.
data2 = {
    "empid": [1, 2, 3],
    "empName": ['Jone', 'Lisa', 'Michele'],
    'salary': [1000, 50000, 1000000],
    # np.nan is the canonical spelling; the np.NAN alias was removed in NumPy 2.0.
    'title': [np.nan, 'clerk', np.nan],
}
df2 = pd.DataFrame(data2)
# Reuse the existing transformer instance on the new frame.
df2_t = mt.transform(df2)
df2_t

Unnamed: 0,empid,empName,salary,title
0,1,Jone,1000,Missing
1,2,Lisa,5000,clerk
2,3,Michele,5000,Missing


In [9]:
# Persist (pickle) the transformer to disk with joblib so it can be reused later.
# Reference: https://stackabuse.com/scikit-learn-save-and-restore-models/
# NOTE(review): the '.\' prefix is a Windows-style relative path; on other platforms
# the backslash presumably becomes part of the file name — confirm before running elsewhere.
from joblib import dump, load
dump(mt, r'.\customer_transformer_pickle')

['.\\customer_transformer_pickle']

In [10]:
# Reload the saved transformer from disk and confirm its type.
# NOTE(review): joblib.load unpickles the file, so only load files from a trusted source.
mt_nextime = load( r'.\customer_transformer_pickle', mmap_mode='r')
type(mt_nextime)

__main__.model_transformer

In [11]:
# An additional column does not affect the transformer: it only touches 'title' and 'salary'.
data3 = {
    "empid": [1, 2, 3],
    "empName": ['Jone', 'Lisa', 'Michele'],
    'salary': [50000, 1000000, 2000],
    # np.nan is the canonical spelling; the np.NAN alias was removed in NumPy 2.0.
    'title': ['software engineer', np.nan, 'vice president'],
    'new column': ['test', 'test', 'test'],
}
df3 = pd.DataFrame(data3)
# Apply the reloaded transformer to the frame that carries the extra column.
df3_t = mt_nextime.transform(df3)
df3_t


Unnamed: 0,empid,empName,salary,title,new column
0,1,Jone,5000,software engineer,test
1,2,Lisa,5000,Missing,test
2,3,Michele,2000,vice president,test


In [15]:
# Sample frame with missing numeric values; both numeric columns are mean-imputed below.
data4 = {
    "empid": [1, 2, 3],
    "empName": ['Jone', 'Lisa', 'Michele'],
    # np.nan is the canonical spelling; the np.NAN alias was removed in NumPy 2.0.
    'salary': [50000, 1000000, np.nan],    # NaN is filled with the column mean below
    'title': ['software engineer', np.nan, 'vice president'],
    'bonus': [np.nan, 10000, 5000],        # NaN is filled with the column mean below
}
df4 = pd.DataFrame(data4)

# Columns whose missing values are replaced by that column's mean.
fill_na_with_mean_list = [ 'bonus', 'salary',]

for i in fill_na_with_mean_list:
    avg = df4[i].mean()          # mean() skips NaN by default
    df4[i] = df4[i].fillna(avg)
df4

Unnamed: 0,empid,empName,salary,title,bonus
0,1,Jone,50000.0,software engineer,7500.0
1,2,Lisa,1000000.0,,10000.0
2,3,Michele,525000.0,vice president,5000.0


In [16]:
# Hard-coding a value in a condition is fine, but when multiple columns need their missing values filled with the training-set median/mean, you do NOT want to hard-code and remember each of those values.

from sklearn.base import BaseEstimator, TransformerMixin
class model_transformer (BaseEstimator, TransformerMixin):
    """Fill missing values in selected columns with that column's mean.

    The mean is computed from whatever frame is passed to ``transform``;
    nothing is learned or stored during ``fit``.
    """

    # Columns whose NaNs are replaced by the column mean.
    fill_na_with_mean_list = [ 'bonus', 'salary',]

    def set_null_as_avg (self, df, fill_na_with_mean_list):
        """Return a copy of ``df`` with NaNs in the listed columns mean-filled.

        Each column's mean is taken from ``df`` itself.
        """
        filled = df.copy()
        for column in fill_na_with_mean_list:
            column_mean = filled[column].mean()
            filled[column] = filled[column].fillna(column_mean)
        return filled

    def fit(self, X, y=None):
        """Return ``self``; no state is learned."""
        return self

    def transform(self, X, **transform_params):
        """Return a mean-imputed copy of ``X`` for the class-level column list."""
        columns = model_transformer.fill_na_with_mean_list
        return self.set_null_as_avg(X, columns)


In [27]:
# Re-create the raw training frame with its missing values intact.
data4 = {
    "empid": [1, 2, 3],
    "empName": ['Jone', 'Lisa', 'Michele'],
    # np.nan is the canonical spelling; the np.NAN alias was removed in NumPy 2.0.
    'salary': [50000, 1000000, np.nan],    # NaN to be imputed with the column mean
    'title': ['software engineer', np.nan, 'vice president'],
    'bonus': [np.nan, 10000, 5000],        # NaN to be imputed with the column mean
}
df4 = pd.DataFrame(data4)
df4

Unnamed: 0,empid,empName,salary,title,bonus
0,1,Jone,50000.0,software engineer,
1,2,Lisa,1000000.0,,10000.0
2,3,Michele,,vice president,5000.0


In [26]:
# Fit and apply the mean-imputing transformer on the training frame df4.
mt1 = model_transformer()
df4b = mt1.fit_transform(df4)
df4b

Unnamed: 0,empid,empName,salary,title,bonus
0,1,Jone,50000.0,software engineer,7500.0
1,2,Lisa,1000000.0,,10000.0
2,3,Michele,525000.0,vice president,5000.0


In [18]:
# Incoming test data, with its own missing salary, title and bonus values.
data5 = {
    "empid": [4, 5, 6],
    "empName": ['Julie', 'Ray', 'Jack'],
    # np.nan is the canonical spelling; the np.NAN alias was removed in NumPy 2.0.
    'salary': [50000, 70000, np.nan],      # NaN to be imputed by the transformers
    'title': ['software engineer', np.nan, 'vice president'],
    'bonus': [np.nan, 10000, 8000],        # NaN to be imputed by the transformers
}
df5 = pd.DataFrame(data5)

In [19]:
# Apply mt1 to the incoming data; this transformer recomputes the means from df5 itself.
df5b = mt1.transform(df5)
df5b


Unnamed: 0,empid,empName,salary,title,bonus
0,4,Julie,50000.0,software engineer,9000.0
1,5,Ray,70000.0,,10000.0
2,6,Jack,60000.0,vice president,8000.0


In [20]:
# The transformer above actually takes the average from the test data, but the model's parameters are based on the training-data average!
# Let the class remember the training values instead.
class model_transformer2 (BaseEstimator, TransformerMixin):
    """Impute missing values with column means learned during ``fit``.

    ``fit`` stores the mean of every column named in
    ``fill_na_with_mean_list``; ``transform`` fills NaNs with those stored
    means, so new data is imputed with the training statistics rather than
    with its own.
    """

    # Columns whose NaNs are replaced by the mean learned in fit().
    fill_na_with_mean_list = [ 'bonus', 'salary',]

    def __init__(self):
        # Learned means, in the same order as fill_na_with_mean_list; empty until fit().
        self.column_mean_list = []

    def set_null_as_avg (self, df, fill_na_with_mean_list):
        """Return a copy of ``df`` with NaNs filled from the stored means.

        Parameters
        ----------
        df : pandas.DataFrame
            Frame to impute; it is copied, not modified.
        fill_na_with_mean_list : list
            Column names, in the same order as the means stored by ``fit``.

        Raises
        ------
        IndexError
            If fewer means are stored than columns listed (e.g. ``fit`` was
            never called).
        """
        df_out = df.copy()
        for k, v in enumerate(fill_na_with_mean_list):
            avg = self.column_mean_list[k]   # get value from instance variable
            df_out[v] = df_out[v].fillna(avg)
        return df_out

    def fit(self, X, y=None):
        """Learn and store the mean of each configured column of ``X``.

        Returns ``self`` so that calls can be chained.
        """
        # Rebuild the list instead of appending: re-fitting must replace the old
        # means, otherwise stale values stay at the indices transform() reads.
        # The column list is read through self, not through a notebook-level
        # global that only exists if an earlier cell happened to run.
        self.column_mean_list = [X[col].mean() for col in self.fill_na_with_mean_list]
        return self

    def transform(self, X, **transform_params):
        """Return a copy of ``X`` imputed with the means learned in ``fit``."""
        df_return = X.pipe(self.set_null_as_avg, self.fill_na_with_mean_list)
        return df_return



In [21]:
# Fit on the training frame so the column means are stored on mt2, then transform it.
mt2 = model_transformer2()
df4c = mt2.fit_transform(df4)
df4c

Unnamed: 0,empid,empName,salary,title,bonus
0,1,Jone,50000.0,software engineer,7500.0
1,2,Lisa,1000000.0,,10000.0
2,3,Michele,525000.0,vice president,5000.0


In [22]:
# Use mt2 to transform new data; the training-data means are 525000 (salary) and 7500 (bonus).
df5c = mt2.transform(df5)
df5c

Unnamed: 0,empid,empName,salary,title,bonus
0,4,Julie,50000.0,software engineer,7500.0
1,5,Ray,70000.0,,10000.0
2,6,Jack,525000.0,vice president,8000.0


In [23]:
# Inspect the means stored by fit(), ordered as fill_na_with_mean_list (bonus, salary).
mt2.column_mean_list

[7500.0, 525000.0]