{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
empidempNamesalarytitle
01Jone1000intern
12Lisa2000clerk
23Michele1000000NaN
\n", "
" ], "text/plain": [ " empid empName salary title\n", "0 1 Jone 1000 intern\n", "1 2 Lisa 2000 clerk\n", "2 3 Michele 1000000 NaN" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#create sample dataframe contain null and need to cap \n", "data = {\"empid\":[1, 2, 3], \n", " \"empName\":['Jone', 'Lisa', 'Michele'], \n", " 'salary':[1000, 2000, 1000000],\n", " 'title':['intern', 'clerk', np.NAN]\n", " \n", " }\n", "df = pd.DataFrame (data)\n", "df" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
empidempNamesalarytitle
01Jone1000intern
12Lisa2000clerk
23Michele1000000Missing
\n", "
" ], "text/plain": [ " empid empName salary title\n", "0 1 Jone 1000 intern\n", "1 2 Lisa 2000 clerk\n", "2 3 Michele 1000000 Missing" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.title = df.title.fillna(value='Missing')\n", "df" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
empidempNamesalarytitle
01Jone1000intern
12Lisa2000clerk
23Michele5000Missing
\n", "
" ], "text/plain": [ " empid empName salary title\n", "0 1 Jone 1000 intern\n", "1 2 Lisa 2000 clerk\n", "2 3 Michele 5000 Missing" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#cap the outlier\n", "salary_new = np.where(df.salary> 5000, 5000, df.salary)\n", "df.salary = salary_new\n", "df " ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "from numpy.random import randint\n", "from sklearn.base import BaseEstimator, TransformerMixin\n", "class model_transformer (BaseEstimator, TransformerMixin):\n", " \n", " def fit(self, X, y=None):\n", " return self # nothing else to do \n", " def transform(self, X): \n", " X.title = X.title.fillna(value='Missing') #convert missing\n", " salary_new = np.where(X.salary> 5000, 5000, X.salary) #this can convert to function\n", " X.salary = salary_new\n", " return X\n", " \n", " " ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
empidempNamesalarytitle
01Jone1000intern
12Lisa2000clerk
23Michele1000000NaN
\n", "
" ], "text/plain": [ " empid empName salary title\n", "0 1 Jone 1000 intern\n", "1 2 Lisa 2000 clerk\n", "2 3 Michele 1000000 NaN" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#reset df \n", "data = {\"empid\":[1, 2, 3], \n", " \"empName\":['Jone', 'Lisa', 'Michele'], \n", " 'salary':[1000, 2000, 1000000],\n", " 'title':['intern', 'clerk', np.NAN]\n", " \n", " }\n", "df = pd.DataFrame (data)\n", "df" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
empidempNamesalarytitle
01Jone1000intern
12Lisa2000clerk
23Michele5000Missing
\n", "
" ], "text/plain": [ " empid empName salary title\n", "0 1 Jone 1000 intern\n", "1 2 Lisa 2000 clerk\n", "2 3 Michele 5000 Missing" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mt = model_transformer()\n", "df1 = mt.fit_transform(df)\n", "df1" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
empidempNamesalarytitle
01Jone1000Missing
12Lisa5000clerk
23Michele5000Missing
\n", "
" ], "text/plain": [ " empid empName salary title\n", "0 1 Jone 1000 Missing\n", "1 2 Lisa 5000 clerk\n", "2 3 Michele 5000 Missing" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#create a dfferent dataset \n", "data2 = {\"empid\":[1, 2, 3], \n", " \"empName\":['Jone', 'Lisa', 'Michele'], \n", " 'salary':[1000, 50000, 1000000],\n", " 'title':[np.NAN, 'clerk', np.NAN]\n", " \n", " }\n", "df2 = pd.DataFrame(data2)\n", "df2_t = mt.transform(df2)\n", "df2_t" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['.\\\\customer_transformer_pickle']" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#pickle the transformer\n", "#https://stackabuse.com/scikit-learn-save-and-restore-models/\n", "from joblib import dump, load\n", "dump(mt, r'.\\customer_transformer_pickle')" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "__main__.model_transformer" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mt_nextime = load( r'.\\customer_transformer_pickle', mmap_mode='r')\n", "type(mt_nextime)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
empidempNamesalarytitlenew column
01Jone5000software engineertest
12Lisa5000Missingtest
23Michele2000vice presidenttest
\n", "
" ], "text/plain": [ " empid empName salary title new column\n", "0 1 Jone 5000 software engineer test\n", "1 2 Lisa 5000 Missing test\n", "2 3 Michele 2000 vice president test" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#additonal coumn did not affect\n", "data3 = {\"empid\":[1, 2, 3], \n", " \"empName\":['Jone', 'Lisa', 'Michele'], \n", " 'salary':[ 50000, 1000000, 2000],\n", " 'title':['software engineer', np.NAN, 'vice president'],\n", " 'new column' : ['test', 'test', 'test'] \n", " }\n", "df3 = pd.DataFrame(data3)\n", "df3_t = mt_nextime.transform(df3)\n", "df3_t\n" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
empidempNamesalarytitlebonus
01Jone50000.0software engineer7500.0
12Lisa1000000.0NaN10000.0
23Michele525000.0vice president5000.0
\n", "
" ], "text/plain": [ " empid empName salary title bonus\n", "0 1 Jone 50000.0 software engineer 7500.0\n", "1 2 Lisa 1000000.0 NaN 10000.0\n", "2 3 Michele 525000.0 vice president 5000.0" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data4 = {\"empid\":[1, 2, 3], \n", " \"empName\":['Jone', 'Lisa', 'Michele'], \n", " 'salary':[ 50000, 1000000, np.NAN], # fill with median\n", " 'title':['software engineer', np.NAN, 'vice president'],\n", " 'bonus' : [np.NAN, 10000, 5000] # fill with average\n", " }\n", "df4 = pd.DataFrame(data4)\n", "\n", "fill_na_with_mean_list = [ 'bonus', 'salary',]\n", "\n", "for i in fill_na_with_mean_list:\n", " avg = df4[i].mean()\n", " df4[i] = df4[i].fillna(avg)\n", "df4" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "#hard code value in condition is fine, but what if you have multiple columns need to be converted with median / mean of training set, you do NOT want to remember those by hard code\n", "\n", "from sklearn.base import BaseEstimator, TransformerMixin\n", "class model_transformer (BaseEstimator, TransformerMixin):\n", " \n", " fill_na_with_mean_list = [ 'bonus', 'salary',]\n", " \n", " def set_null_as_avg (self, df, fill_na_with_mean_list):\n", " df_out = df.copy()\n", " for i in fill_na_with_mean_list:\n", " avg = df_out[i].mean()\n", " df_out[i] = df_out[i].fillna(avg)\n", " return df_out\n", " \n", " def fit(self, X, y=None):\n", " return self # nothing else to do \n", " \n", " def transform(self, X, **transform_params):\n", " \n", " \n", " df_return= (X.pipe(self.set_null_as_avg, model_transformer.fill_na_with_mean_list )\n", " )\n", " \n", " return df_return\n" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
empidempNamesalarytitlebonus
01Jone50000.0software engineerNaN
12Lisa1000000.0NaN10000.0
23MicheleNaNvice president5000.0
\n", "
" ], "text/plain": [ " empid empName salary title bonus\n", "0 1 Jone 50000.0 software engineer NaN\n", "1 2 Lisa 1000000.0 NaN 10000.0\n", "2 3 Michele NaN vice president 5000.0" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data4 = {\"empid\":[1, 2, 3], \n", " \"empName\":['Jone', 'Lisa', 'Michele'], \n", " 'salary':[ 50000, 1000000, np.NAN], # fill with median\n", " 'title':['software engineer', np.NAN, 'vice president'],\n", " 'bonus' : [np.NAN, 10000, 5000] # fill with average\n", " }\n", "df4 = pd.DataFrame(data4)\n", "df4" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
empidempNamesalarytitlebonus
01Jone50000.0software engineer7500.0
12Lisa1000000.0NaN10000.0
23Michele525000.0vice president5000.0
\n", "
" ], "text/plain": [ " empid empName salary title bonus\n", "0 1 Jone 50000.0 software engineer 7500.0\n", "1 2 Lisa 1000000.0 NaN 10000.0\n", "2 3 Michele 525000.0 vice president 5000.0" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mt1 = model_transformer()\n", "df4b = mt1.fit_transform(df4)\n", "df4b" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "#imcoming test data \n", "data5 = {\"empid\":[4, 5, 6], \n", " \"empName\":['Julie', 'Ray', 'Jack'], \n", " 'salary':[ 50000, 70000, np.NAN], # fill with median\n", " 'title':['software engineer', np.NAN, 'vice president'],\n", " 'bonus' : [np.NAN, 10000, 8000] # fill with average\n", " }\n", "df5 = pd.DataFrame(data5)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
empidempNamesalarytitlebonus
04Julie50000.0software engineer9000.0
15Ray70000.0NaN10000.0
26Jack60000.0vice president8000.0
\n", "
" ], "text/plain": [ " empid empName salary title bonus\n", "0 4 Julie 50000.0 software engineer 9000.0\n", "1 5 Ray 70000.0 NaN 10000.0\n", "2 6 Jack 60000.0 vice president 8000.0" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df5b = mt1.transform(df5)\n", "df5b\n" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "#it actually only get averge from test data, but your model parameter is based on training data average!!\n", "#let the class remeber the value\n", "class model_transformer2 (BaseEstimator, TransformerMixin):\n", " \n", " fill_na_with_mean_list = [ 'bonus', 'salary',]\n", " \n", " def __init__(self):\n", " self.column_mean_list = [] \n", " \n", " \n", " def set_null_as_avg (self, df, fill_na_with_mean_list):\n", " df_out = df.copy()\n", " for k, v in enumerate(fill_na_with_mean_list):\n", " avg = self.column_mean_list[k] #get value from instance variable\n", " df_out[v] = df_out[v].fillna(avg)\n", " return df_out\n", " \n", " def fit(self, X, y=None):\n", " for i in fill_na_with_mean_list:\n", " self.column_mean_list.append(X[i].mean()) #when training object is created,save the mean value\n", " return self # nothing else to do \n", " \n", " def transform(self, X, **transform_params):\n", " \n", " \n", " df_return= (X.pipe(self.set_null_as_avg, model_transformer2.fill_na_with_mean_list ) #list is class variable\n", " )\n", " \n", " return df_return\n", "\n" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
empidempNamesalarytitlebonus
01Jone50000.0software engineer7500.0
12Lisa1000000.0NaN10000.0
23Michele525000.0vice president5000.0
\n", "
" ], "text/plain": [ " empid empName salary title bonus\n", "0 1 Jone 50000.0 software engineer 7500.0\n", "1 2 Lisa 1000000.0 NaN 10000.0\n", "2 3 Michele 525000.0 vice president 5000.0" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mt2 = model_transformer2()\n", "df4c = mt2.fit_transform(df4)\n", "df4c" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
empidempNamesalarytitlebonus
04Julie50000.0software engineer7500.0
15Ray70000.0NaN10000.0
26Jack525000.0vice president8000.0
\n", "
" ], "text/plain": [ " empid empName salary title bonus\n", "0 4 Julie 50000.0 software engineer 7500.0\n", "1 5 Ray 70000.0 NaN 10000.0\n", "2 6 Jack 525000.0 vice president 8000.0" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#use mt2 to transfrom new data training dtaa is 52500/7500 for salary and bonus\n", "df5c = mt2.transform(df5)\n", "df5c" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[7500.0, 525000.0]" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mt2.column_mean_list" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" } }, "nbformat": 4, "nbformat_minor": 2 }