{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" empid | \n",
" empName | \n",
" salary | \n",
" title | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" Jone | \n",
" 1000 | \n",
" intern | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
" Lisa | \n",
" 2000 | \n",
" clerk | \n",
"
\n",
" \n",
" 2 | \n",
" 3 | \n",
" Michele | \n",
" 1000000 | \n",
" NaN | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" empid empName salary title\n",
"0 1 Jone 1000 intern\n",
"1 2 Lisa 2000 clerk\n",
"2 3 Michele 1000000 NaN"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#create sample dataframe contain null and need to cap \n",
"data = {\"empid\":[1, 2, 3], \n",
" \"empName\":['Jone', 'Lisa', 'Michele'], \n",
" 'salary':[1000, 2000, 1000000],\n",
" 'title':['intern', 'clerk', np.NAN]\n",
" \n",
" }\n",
"df = pd.DataFrame (data)\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" empid | \n",
" empName | \n",
" salary | \n",
" title | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" Jone | \n",
" 1000 | \n",
" intern | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
" Lisa | \n",
" 2000 | \n",
" clerk | \n",
"
\n",
" \n",
" 2 | \n",
" 3 | \n",
" Michele | \n",
" 1000000 | \n",
" Missing | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" empid empName salary title\n",
"0 1 Jone 1000 intern\n",
"1 2 Lisa 2000 clerk\n",
"2 3 Michele 1000000 Missing"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.title = df.title.fillna(value='Missing')\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" empid | \n",
" empName | \n",
" salary | \n",
" title | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" Jone | \n",
" 1000 | \n",
" intern | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
" Lisa | \n",
" 2000 | \n",
" clerk | \n",
"
\n",
" \n",
" 2 | \n",
" 3 | \n",
" Michele | \n",
" 5000 | \n",
" Missing | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" empid empName salary title\n",
"0 1 Jone 1000 intern\n",
"1 2 Lisa 2000 clerk\n",
"2 3 Michele 5000 Missing"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#cap the outlier\n",
"salary_new = np.where(df.salary> 5000, 5000, df.salary)\n",
"df.salary = salary_new\n",
"df "
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"from numpy.random import randint\n",
"from sklearn.base import BaseEstimator, TransformerMixin\n",
"class model_transformer (BaseEstimator, TransformerMixin):\n",
" \n",
" def fit(self, X, y=None):\n",
" return self # nothing else to do \n",
" def transform(self, X): \n",
" X.title = X.title.fillna(value='Missing') #convert missing\n",
" salary_new = np.where(X.salary> 5000, 5000, X.salary) #this can convert to function\n",
" X.salary = salary_new\n",
" return X\n",
" \n",
" "
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" empid | \n",
" empName | \n",
" salary | \n",
" title | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" Jone | \n",
" 1000 | \n",
" intern | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
" Lisa | \n",
" 2000 | \n",
" clerk | \n",
"
\n",
" \n",
" 2 | \n",
" 3 | \n",
" Michele | \n",
" 1000000 | \n",
" NaN | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" empid empName salary title\n",
"0 1 Jone 1000 intern\n",
"1 2 Lisa 2000 clerk\n",
"2 3 Michele 1000000 NaN"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#reset df \n",
"data = {\"empid\":[1, 2, 3], \n",
" \"empName\":['Jone', 'Lisa', 'Michele'], \n",
" 'salary':[1000, 2000, 1000000],\n",
" 'title':['intern', 'clerk', np.NAN]\n",
" \n",
" }\n",
"df = pd.DataFrame (data)\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" empid | \n",
" empName | \n",
" salary | \n",
" title | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" Jone | \n",
" 1000 | \n",
" intern | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
" Lisa | \n",
" 2000 | \n",
" clerk | \n",
"
\n",
" \n",
" 2 | \n",
" 3 | \n",
" Michele | \n",
" 5000 | \n",
" Missing | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" empid empName salary title\n",
"0 1 Jone 1000 intern\n",
"1 2 Lisa 2000 clerk\n",
"2 3 Michele 5000 Missing"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"mt = model_transformer()\n",
"df1 = mt.fit_transform(df)\n",
"df1"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" empid | \n",
" empName | \n",
" salary | \n",
" title | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" Jone | \n",
" 1000 | \n",
" Missing | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
" Lisa | \n",
" 5000 | \n",
" clerk | \n",
"
\n",
" \n",
" 2 | \n",
" 3 | \n",
" Michele | \n",
" 5000 | \n",
" Missing | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" empid empName salary title\n",
"0 1 Jone 1000 Missing\n",
"1 2 Lisa 5000 clerk\n",
"2 3 Michele 5000 Missing"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#create a dfferent dataset \n",
"data2 = {\"empid\":[1, 2, 3], \n",
" \"empName\":['Jone', 'Lisa', 'Michele'], \n",
" 'salary':[1000, 50000, 1000000],\n",
" 'title':[np.NAN, 'clerk', np.NAN]\n",
" \n",
" }\n",
"df2 = pd.DataFrame(data2)\n",
"df2_t = mt.transform(df2)\n",
"df2_t"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['.\\\\customer_transformer_pickle']"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#pickle the transformer\n",
"#https://stackabuse.com/scikit-learn-save-and-restore-models/\n",
"from joblib import dump, load\n",
"dump(mt, r'.\\customer_transformer_pickle')"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"__main__.model_transformer"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"mt_nextime = load( r'.\\customer_transformer_pickle', mmap_mode='r')\n",
"type(mt_nextime)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" empid | \n",
" empName | \n",
" salary | \n",
" title | \n",
" new column | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" Jone | \n",
" 5000 | \n",
" software engineer | \n",
" test | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
" Lisa | \n",
" 5000 | \n",
" Missing | \n",
" test | \n",
"
\n",
" \n",
" 2 | \n",
" 3 | \n",
" Michele | \n",
" 2000 | \n",
" vice president | \n",
" test | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" empid empName salary title new column\n",
"0 1 Jone 5000 software engineer test\n",
"1 2 Lisa 5000 Missing test\n",
"2 3 Michele 2000 vice president test"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#additonal coumn did not affect\n",
"data3 = {\"empid\":[1, 2, 3], \n",
" \"empName\":['Jone', 'Lisa', 'Michele'], \n",
" 'salary':[ 50000, 1000000, 2000],\n",
" 'title':['software engineer', np.NAN, 'vice president'],\n",
" 'new column' : ['test', 'test', 'test'] \n",
" }\n",
"df3 = pd.DataFrame(data3)\n",
"df3_t = mt_nextime.transform(df3)\n",
"df3_t\n"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" empid | \n",
" empName | \n",
" salary | \n",
" title | \n",
" bonus | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" Jone | \n",
" 50000.0 | \n",
" software engineer | \n",
" 7500.0 | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
" Lisa | \n",
" 1000000.0 | \n",
" NaN | \n",
" 10000.0 | \n",
"
\n",
" \n",
" 2 | \n",
" 3 | \n",
" Michele | \n",
" 525000.0 | \n",
" vice president | \n",
" 5000.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" empid empName salary title bonus\n",
"0 1 Jone 50000.0 software engineer 7500.0\n",
"1 2 Lisa 1000000.0 NaN 10000.0\n",
"2 3 Michele 525000.0 vice president 5000.0"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data4 = {\"empid\":[1, 2, 3], \n",
" \"empName\":['Jone', 'Lisa', 'Michele'], \n",
" 'salary':[ 50000, 1000000, np.NAN], # fill with median\n",
" 'title':['software engineer', np.NAN, 'vice president'],\n",
" 'bonus' : [np.NAN, 10000, 5000] # fill with average\n",
" }\n",
"df4 = pd.DataFrame(data4)\n",
"\n",
"fill_na_with_mean_list = [ 'bonus', 'salary',]\n",
"\n",
"for i in fill_na_with_mean_list:\n",
" avg = df4[i].mean()\n",
" df4[i] = df4[i].fillna(avg)\n",
"df4"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"#hard code value in condition is fine, but what if you have multiple columns need to be converted with median / mean of training set, you do NOT want to remember those by hard code\n",
"\n",
"from sklearn.base import BaseEstimator, TransformerMixin\n",
"class model_transformer (BaseEstimator, TransformerMixin):\n",
" \n",
" fill_na_with_mean_list = [ 'bonus', 'salary',]\n",
" \n",
" def set_null_as_avg (self, df, fill_na_with_mean_list):\n",
" df_out = df.copy()\n",
" for i in fill_na_with_mean_list:\n",
" avg = df_out[i].mean()\n",
" df_out[i] = df_out[i].fillna(avg)\n",
" return df_out\n",
" \n",
" def fit(self, X, y=None):\n",
" return self # nothing else to do \n",
" \n",
" def transform(self, X, **transform_params):\n",
" \n",
" \n",
" df_return= (X.pipe(self.set_null_as_avg, model_transformer.fill_na_with_mean_list )\n",
" )\n",
" \n",
" return df_return\n"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" empid | \n",
" empName | \n",
" salary | \n",
" title | \n",
" bonus | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" Jone | \n",
" 50000.0 | \n",
" software engineer | \n",
" NaN | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
" Lisa | \n",
" 1000000.0 | \n",
" NaN | \n",
" 10000.0 | \n",
"
\n",
" \n",
" 2 | \n",
" 3 | \n",
" Michele | \n",
" NaN | \n",
" vice president | \n",
" 5000.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" empid empName salary title bonus\n",
"0 1 Jone 50000.0 software engineer NaN\n",
"1 2 Lisa 1000000.0 NaN 10000.0\n",
"2 3 Michele NaN vice president 5000.0"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data4 = {\"empid\":[1, 2, 3], \n",
" \"empName\":['Jone', 'Lisa', 'Michele'], \n",
" 'salary':[ 50000, 1000000, np.NAN], # fill with median\n",
" 'title':['software engineer', np.NAN, 'vice president'],\n",
" 'bonus' : [np.NAN, 10000, 5000] # fill with average\n",
" }\n",
"df4 = pd.DataFrame(data4)\n",
"df4"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" empid | \n",
" empName | \n",
" salary | \n",
" title | \n",
" bonus | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" Jone | \n",
" 50000.0 | \n",
" software engineer | \n",
" 7500.0 | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
" Lisa | \n",
" 1000000.0 | \n",
" NaN | \n",
" 10000.0 | \n",
"
\n",
" \n",
" 2 | \n",
" 3 | \n",
" Michele | \n",
" 525000.0 | \n",
" vice president | \n",
" 5000.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" empid empName salary title bonus\n",
"0 1 Jone 50000.0 software engineer 7500.0\n",
"1 2 Lisa 1000000.0 NaN 10000.0\n",
"2 3 Michele 525000.0 vice president 5000.0"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"mt1 = model_transformer()\n",
"df4b = mt1.fit_transform(df4)\n",
"df4b"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"#imcoming test data \n",
"data5 = {\"empid\":[4, 5, 6], \n",
" \"empName\":['Julie', 'Ray', 'Jack'], \n",
" 'salary':[ 50000, 70000, np.NAN], # fill with median\n",
" 'title':['software engineer', np.NAN, 'vice president'],\n",
" 'bonus' : [np.NAN, 10000, 8000] # fill with average\n",
" }\n",
"df5 = pd.DataFrame(data5)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" empid | \n",
" empName | \n",
" salary | \n",
" title | \n",
" bonus | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 4 | \n",
" Julie | \n",
" 50000.0 | \n",
" software engineer | \n",
" 9000.0 | \n",
"
\n",
" \n",
" 1 | \n",
" 5 | \n",
" Ray | \n",
" 70000.0 | \n",
" NaN | \n",
" 10000.0 | \n",
"
\n",
" \n",
" 2 | \n",
" 6 | \n",
" Jack | \n",
" 60000.0 | \n",
" vice president | \n",
" 8000.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" empid empName salary title bonus\n",
"0 4 Julie 50000.0 software engineer 9000.0\n",
"1 5 Ray 70000.0 NaN 10000.0\n",
"2 6 Jack 60000.0 vice president 8000.0"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df5b = mt1.transform(df5)\n",
"df5b\n"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"#it actually only get averge from test data, but your model parameter is based on training data average!!\n",
"#let the class remeber the value\n",
"class model_transformer2 (BaseEstimator, TransformerMixin):\n",
" \n",
" fill_na_with_mean_list = [ 'bonus', 'salary',]\n",
" \n",
" def __init__(self):\n",
" self.column_mean_list = [] \n",
" \n",
" \n",
" def set_null_as_avg (self, df, fill_na_with_mean_list):\n",
" df_out = df.copy()\n",
" for k, v in enumerate(fill_na_with_mean_list):\n",
" avg = self.column_mean_list[k] #get value from instance variable\n",
" df_out[v] = df_out[v].fillna(avg)\n",
" return df_out\n",
" \n",
" def fit(self, X, y=None):\n",
" for i in fill_na_with_mean_list:\n",
" self.column_mean_list.append(X[i].mean()) #when training object is created,save the mean value\n",
" return self # nothing else to do \n",
" \n",
" def transform(self, X, **transform_params):\n",
" \n",
" \n",
" df_return= (X.pipe(self.set_null_as_avg, model_transformer2.fill_na_with_mean_list ) #list is class variable\n",
" )\n",
" \n",
" return df_return\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" empid | \n",
" empName | \n",
" salary | \n",
" title | \n",
" bonus | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" Jone | \n",
" 50000.0 | \n",
" software engineer | \n",
" 7500.0 | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
" Lisa | \n",
" 1000000.0 | \n",
" NaN | \n",
" 10000.0 | \n",
"
\n",
" \n",
" 2 | \n",
" 3 | \n",
" Michele | \n",
" 525000.0 | \n",
" vice president | \n",
" 5000.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" empid empName salary title bonus\n",
"0 1 Jone 50000.0 software engineer 7500.0\n",
"1 2 Lisa 1000000.0 NaN 10000.0\n",
"2 3 Michele 525000.0 vice president 5000.0"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"mt2 = model_transformer2()\n",
"df4c = mt2.fit_transform(df4)\n",
"df4c"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" empid | \n",
" empName | \n",
" salary | \n",
" title | \n",
" bonus | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 4 | \n",
" Julie | \n",
" 50000.0 | \n",
" software engineer | \n",
" 7500.0 | \n",
"
\n",
" \n",
" 1 | \n",
" 5 | \n",
" Ray | \n",
" 70000.0 | \n",
" NaN | \n",
" 10000.0 | \n",
"
\n",
" \n",
" 2 | \n",
" 6 | \n",
" Jack | \n",
" 525000.0 | \n",
" vice president | \n",
" 8000.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" empid empName salary title bonus\n",
"0 4 Julie 50000.0 software engineer 7500.0\n",
"1 5 Ray 70000.0 NaN 10000.0\n",
"2 6 Jack 525000.0 vice president 8000.0"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#use mt2 to transfrom new data training dtaa is 52500/7500 for salary and bonus\n",
"df5c = mt2.transform(df5)\n",
"df5c"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[7500.0, 525000.0]"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"mt2.column_mean_list"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}