{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>empid</th>\n",
       "      <th>empName</th>\n",
       "      <th>salary</th>\n",
       "      <th>title</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>Jone</td>\n",
       "      <td>1000</td>\n",
       "      <td>intern</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>Lisa</td>\n",
       "      <td>2000</td>\n",
       "      <td>clerk</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>Michele</td>\n",
       "      <td>1000000</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   empid  empName   salary   title\n",
       "0      1     Jone     1000  intern\n",
       "1      2     Lisa     2000   clerk\n",
       "2      3  Michele  1000000     NaN"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#create sample dataframe contain null and need to cap \n",
    "data = {\"empid\":[1, 2, 3], \n",
    "        \"empName\":['Jone', 'Lisa', 'Michele'], \n",
    "        'salary':[1000, 2000, 1000000],\n",
    "        'title':['intern', 'clerk', np.NAN]\n",
    "           \n",
    "       }\n",
    "df = pd.DataFrame (data)\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>empid</th>\n",
       "      <th>empName</th>\n",
       "      <th>salary</th>\n",
       "      <th>title</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>Jone</td>\n",
       "      <td>1000</td>\n",
       "      <td>intern</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>Lisa</td>\n",
       "      <td>2000</td>\n",
       "      <td>clerk</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>Michele</td>\n",
       "      <td>1000000</td>\n",
       "      <td>Missing</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   empid  empName   salary    title\n",
       "0      1     Jone     1000   intern\n",
       "1      2     Lisa     2000    clerk\n",
       "2      3  Michele  1000000  Missing"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.title = df.title.fillna(value='Missing')\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>empid</th>\n",
       "      <th>empName</th>\n",
       "      <th>salary</th>\n",
       "      <th>title</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>Jone</td>\n",
       "      <td>1000</td>\n",
       "      <td>intern</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>Lisa</td>\n",
       "      <td>2000</td>\n",
       "      <td>clerk</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>Michele</td>\n",
       "      <td>5000</td>\n",
       "      <td>Missing</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   empid  empName  salary    title\n",
       "0      1     Jone    1000   intern\n",
       "1      2     Lisa    2000    clerk\n",
       "2      3  Michele    5000  Missing"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#cap the outlier\n",
    "salary_new = np.where(df.salary> 5000, 5000, df.salary)\n",
    "df.salary = salary_new\n",
    "df    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "from numpy.random import randint\n",
    "from sklearn.base import BaseEstimator, TransformerMixin\n",
    "class model_transformer (BaseEstimator, TransformerMixin):\n",
    "    \n",
    "    def fit(self, X, y=None):\n",
    "        return self  # nothing else to do   \n",
    "    def transform(self, X):        \n",
    "        X.title = X.title.fillna(value='Missing')   #convert missing\n",
    "        salary_new = np.where(X.salary> 5000, 5000, X.salary)  #this can convert to function\n",
    "        X.salary = salary_new\n",
    "        return X\n",
    "        \n",
    "        "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>empid</th>\n",
       "      <th>empName</th>\n",
       "      <th>salary</th>\n",
       "      <th>title</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>Jone</td>\n",
       "      <td>1000</td>\n",
       "      <td>intern</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>Lisa</td>\n",
       "      <td>2000</td>\n",
       "      <td>clerk</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>Michele</td>\n",
       "      <td>1000000</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   empid  empName   salary   title\n",
       "0      1     Jone     1000  intern\n",
       "1      2     Lisa     2000   clerk\n",
       "2      3  Michele  1000000     NaN"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#reset df \n",
    "data = {\"empid\":[1, 2, 3], \n",
    "        \"empName\":['Jone', 'Lisa', 'Michele'], \n",
    "        'salary':[1000, 2000, 1000000],\n",
    "        'title':['intern', 'clerk', np.NAN]\n",
    "           \n",
    "       }\n",
    "df = pd.DataFrame (data)\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>empid</th>\n",
       "      <th>empName</th>\n",
       "      <th>salary</th>\n",
       "      <th>title</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>Jone</td>\n",
       "      <td>1000</td>\n",
       "      <td>intern</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>Lisa</td>\n",
       "      <td>2000</td>\n",
       "      <td>clerk</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>Michele</td>\n",
       "      <td>5000</td>\n",
       "      <td>Missing</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   empid  empName  salary    title\n",
       "0      1     Jone    1000   intern\n",
       "1      2     Lisa    2000    clerk\n",
       "2      3  Michele    5000  Missing"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mt = model_transformer()\n",
    "df1 = mt.fit_transform(df)\n",
    "df1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>empid</th>\n",
       "      <th>empName</th>\n",
       "      <th>salary</th>\n",
       "      <th>title</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>Jone</td>\n",
       "      <td>1000</td>\n",
       "      <td>Missing</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>Lisa</td>\n",
       "      <td>5000</td>\n",
       "      <td>clerk</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>Michele</td>\n",
       "      <td>5000</td>\n",
       "      <td>Missing</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   empid  empName  salary    title\n",
       "0      1     Jone    1000  Missing\n",
       "1      2     Lisa    5000    clerk\n",
       "2      3  Michele    5000  Missing"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#create a dfferent dataset \n",
    "data2 = {\"empid\":[1, 2, 3], \n",
    "        \"empName\":['Jone', 'Lisa', 'Michele'], \n",
    "        'salary':[1000, 50000, 1000000],\n",
    "        'title':[np.NAN, 'clerk', np.NAN]\n",
    "           \n",
    "       }\n",
    "df2 = pd.DataFrame(data2)\n",
    "df2_t = mt.transform(df2)\n",
    "df2_t"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['.\\\\customer_transformer_pickle']"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#pickle the transformer\n",
    "#https://stackabuse.com/scikit-learn-save-and-restore-models/\n",
    "from joblib import dump, load\n",
    "dump(mt, r'.\\customer_transformer_pickle')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "__main__.model_transformer"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mt_nextime = load( r'.\\customer_transformer_pickle', mmap_mode='r')\n",
    "type(mt_nextime)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>empid</th>\n",
       "      <th>empName</th>\n",
       "      <th>salary</th>\n",
       "      <th>title</th>\n",
       "      <th>new column</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>Jone</td>\n",
       "      <td>5000</td>\n",
       "      <td>software engineer</td>\n",
       "      <td>test</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>Lisa</td>\n",
       "      <td>5000</td>\n",
       "      <td>Missing</td>\n",
       "      <td>test</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>Michele</td>\n",
       "      <td>2000</td>\n",
       "      <td>vice president</td>\n",
       "      <td>test</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   empid  empName  salary              title new column\n",
       "0      1     Jone    5000  software engineer       test\n",
       "1      2     Lisa    5000            Missing       test\n",
       "2      3  Michele    2000     vice president       test"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#additonal coumn did not affect\n",
    "data3 = {\"empid\":[1, 2, 3], \n",
    "        \"empName\":['Jone', 'Lisa', 'Michele'], \n",
    "        'salary':[ 50000, 1000000, 2000],\n",
    "        'title':['software engineer',  np.NAN, 'vice president'],\n",
    "         'new column' : ['test', 'test', 'test']  \n",
    "       }\n",
    "df3 = pd.DataFrame(data3)\n",
    "df3_t = mt_nextime.transform(df3)\n",
    "df3_t\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>empid</th>\n",
       "      <th>empName</th>\n",
       "      <th>salary</th>\n",
       "      <th>title</th>\n",
       "      <th>bonus</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>Jone</td>\n",
       "      <td>50000.0</td>\n",
       "      <td>software engineer</td>\n",
       "      <td>7500.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>Lisa</td>\n",
       "      <td>1000000.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>10000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>Michele</td>\n",
       "      <td>525000.0</td>\n",
       "      <td>vice president</td>\n",
       "      <td>5000.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   empid  empName     salary              title    bonus\n",
       "0      1     Jone    50000.0  software engineer   7500.0\n",
       "1      2     Lisa  1000000.0                NaN  10000.0\n",
       "2      3  Michele   525000.0     vice president   5000.0"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data4 =  {\"empid\":[1, 2, 3], \n",
    "        \"empName\":['Jone', 'Lisa', 'Michele'], \n",
    "        'salary':[ 50000, 1000000, np.NAN],         # fill with median\n",
    "        'title':['software engineer',  np.NAN, 'vice president'],\n",
    "         'bonus' : [np.NAN, 10000, 5000]    # fill with average\n",
    "       }\n",
    "df4 = pd.DataFrame(data4)\n",
    "\n",
    "fill_na_with_mean_list = [ 'bonus', 'salary',]\n",
    "\n",
    "for i in fill_na_with_mean_list:\n",
    "        avg = df4[i].mean()\n",
    "        df4[i] =  df4[i].fillna(avg)\n",
    "df4"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "#hard code value in condition is fine, but what if you have multiple columns need to be converted with median / mean of training set, you do NOT want to remember those by hard code\n",
    "\n",
    "from sklearn.base import BaseEstimator, TransformerMixin\n",
    "class model_transformer (BaseEstimator, TransformerMixin):\n",
    "    \n",
    "    fill_na_with_mean_list = [ 'bonus', 'salary',]\n",
    "    \n",
    "    def set_null_as_avg (self, df, fill_na_with_mean_list):\n",
    "        df_out = df.copy()\n",
    "        for i in fill_na_with_mean_list:\n",
    "            avg = df_out[i].mean()\n",
    "            df_out[i] =  df_out[i].fillna(avg)\n",
    "        return df_out\n",
    "    \n",
    "    def fit(self, X, y=None):\n",
    "        return self  # nothing else to do   \n",
    "    \n",
    "    def transform(self, X, **transform_params):\n",
    "        \n",
    "        \n",
    "        df_return= (X.pipe(self.set_null_as_avg, model_transformer.fill_na_with_mean_list )\n",
    "                   )\n",
    "        \n",
    "        return df_return\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>empid</th>\n",
       "      <th>empName</th>\n",
       "      <th>salary</th>\n",
       "      <th>title</th>\n",
       "      <th>bonus</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>Jone</td>\n",
       "      <td>50000.0</td>\n",
       "      <td>software engineer</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>Lisa</td>\n",
       "      <td>1000000.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>10000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>Michele</td>\n",
       "      <td>NaN</td>\n",
       "      <td>vice president</td>\n",
       "      <td>5000.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   empid  empName     salary              title    bonus\n",
       "0      1     Jone    50000.0  software engineer      NaN\n",
       "1      2     Lisa  1000000.0                NaN  10000.0\n",
       "2      3  Michele        NaN     vice president   5000.0"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data4 =  {\"empid\":[1, 2, 3], \n",
    "        \"empName\":['Jone', 'Lisa', 'Michele'], \n",
    "        'salary':[ 50000, 1000000, np.NAN],         # fill with median\n",
    "        'title':['software engineer',  np.NAN, 'vice president'],\n",
    "         'bonus' : [np.NAN, 10000, 5000]    # fill with average\n",
    "       }\n",
    "df4 = pd.DataFrame(data4)\n",
    "df4"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>empid</th>\n",
       "      <th>empName</th>\n",
       "      <th>salary</th>\n",
       "      <th>title</th>\n",
       "      <th>bonus</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>Jone</td>\n",
       "      <td>50000.0</td>\n",
       "      <td>software engineer</td>\n",
       "      <td>7500.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>Lisa</td>\n",
       "      <td>1000000.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>10000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>Michele</td>\n",
       "      <td>525000.0</td>\n",
       "      <td>vice president</td>\n",
       "      <td>5000.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   empid  empName     salary              title    bonus\n",
       "0      1     Jone    50000.0  software engineer   7500.0\n",
       "1      2     Lisa  1000000.0                NaN  10000.0\n",
       "2      3  Michele   525000.0     vice president   5000.0"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mt1 = model_transformer()\n",
    "df4b = mt1.fit_transform(df4)\n",
    "df4b"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "#imcoming test data \n",
    "data5 =  {\"empid\":[4, 5, 6], \n",
    "        \"empName\":['Julie', 'Ray', 'Jack'], \n",
    "        'salary':[ 50000, 70000, np.NAN],         # fill with median\n",
    "        'title':['software engineer',  np.NAN, 'vice president'],\n",
    "         'bonus' : [np.NAN, 10000, 8000]    # fill with average\n",
    "       }\n",
    "df5 = pd.DataFrame(data5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>empid</th>\n",
       "      <th>empName</th>\n",
       "      <th>salary</th>\n",
       "      <th>title</th>\n",
       "      <th>bonus</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>4</td>\n",
       "      <td>Julie</td>\n",
       "      <td>50000.0</td>\n",
       "      <td>software engineer</td>\n",
       "      <td>9000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>5</td>\n",
       "      <td>Ray</td>\n",
       "      <td>70000.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>10000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>6</td>\n",
       "      <td>Jack</td>\n",
       "      <td>60000.0</td>\n",
       "      <td>vice president</td>\n",
       "      <td>8000.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   empid empName   salary              title    bonus\n",
       "0      4   Julie  50000.0  software engineer   9000.0\n",
       "1      5     Ray  70000.0                NaN  10000.0\n",
       "2      6    Jack  60000.0     vice president   8000.0"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df5b = mt1.transform(df5)\n",
    "df5b\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "#it actually only get averge from test data, but your model parameter is based on training data average!!\n",
    "#let the class remeber the value\n",
    "class model_transformer2 (BaseEstimator, TransformerMixin):\n",
    "        \n",
    "    fill_na_with_mean_list = [ 'bonus', 'salary',]\n",
    "    \n",
    "    def __init__(self):\n",
    "        self.column_mean_list = []        \n",
    "    \n",
    "    \n",
    "    def set_null_as_avg (self, df, fill_na_with_mean_list):\n",
    "        df_out = df.copy()\n",
    "        for k, v in enumerate(fill_na_with_mean_list):\n",
    "            avg = self.column_mean_list[k]   #get value from instance variable\n",
    "            df_out[v] = df_out[v].fillna(avg)\n",
    "        return df_out\n",
    "    \n",
    "    def fit(self, X, y=None):\n",
    "        for i in fill_na_with_mean_list:\n",
    "            self.column_mean_list.append(X[i].mean())   #when training object is created,save the mean value\n",
    "        return self  # nothing else to do   \n",
    "    \n",
    "    def transform(self, X, **transform_params):\n",
    "        \n",
    "        \n",
    "        df_return= (X.pipe(self.set_null_as_avg, model_transformer2.fill_na_with_mean_list )  #list is class variable\n",
    "                   )\n",
    "        \n",
    "        return df_return\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>empid</th>\n",
       "      <th>empName</th>\n",
       "      <th>salary</th>\n",
       "      <th>title</th>\n",
       "      <th>bonus</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>Jone</td>\n",
       "      <td>50000.0</td>\n",
       "      <td>software engineer</td>\n",
       "      <td>7500.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>Lisa</td>\n",
       "      <td>1000000.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>10000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>Michele</td>\n",
       "      <td>525000.0</td>\n",
       "      <td>vice president</td>\n",
       "      <td>5000.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   empid  empName     salary              title    bonus\n",
       "0      1     Jone    50000.0  software engineer   7500.0\n",
       "1      2     Lisa  1000000.0                NaN  10000.0\n",
       "2      3  Michele   525000.0     vice president   5000.0"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mt2 = model_transformer2()\n",
    "df4c = mt2.fit_transform(df4)\n",
    "df4c"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>empid</th>\n",
       "      <th>empName</th>\n",
       "      <th>salary</th>\n",
       "      <th>title</th>\n",
       "      <th>bonus</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>4</td>\n",
       "      <td>Julie</td>\n",
       "      <td>50000.0</td>\n",
       "      <td>software engineer</td>\n",
       "      <td>7500.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>5</td>\n",
       "      <td>Ray</td>\n",
       "      <td>70000.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>10000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>6</td>\n",
       "      <td>Jack</td>\n",
       "      <td>525000.0</td>\n",
       "      <td>vice president</td>\n",
       "      <td>8000.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   empid empName    salary              title    bonus\n",
       "0      4   Julie   50000.0  software engineer   7500.0\n",
       "1      5     Ray   70000.0                NaN  10000.0\n",
       "2      6    Jack  525000.0     vice president   8000.0"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#use mt2 to transfrom new data    training dtaa is 52500/7500 for salary and bonus\n",
    "df5c = mt2.transform(df5)\n",
    "df5c"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[7500.0, 525000.0]"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mt2.column_mean_list"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}