{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"#import packages\n",
"import pandas as pd\n",
"import numpy as np\n",
"import sys\n",
"%matplotlib inline\n",
"import matplotlib as mpl\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"\n",
"#import file containing function\n",
"sys.path.append(r'.\\common_function') #add path to default module search path\n",
"from EDA_function import *\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Load data from seaborn**"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['anagrams',\n",
" 'anscombe',\n",
" 'attention',\n",
" 'brain_networks',\n",
" 'car_crashes',\n",
" 'diamonds',\n",
" 'dots',\n",
" 'exercise',\n",
" 'flights',\n",
" 'fmri',\n",
" 'gammas',\n",
" 'geyser',\n",
" 'iris',\n",
" 'mpg',\n",
" 'penguins',\n",
" 'planets',\n",
" 'taxis',\n",
" 'tips',\n",
" 'titanic']"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sns.get_dataset_names()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" survived \n",
" pclass \n",
" sex \n",
" age \n",
" sibsp \n",
" parch \n",
" fare \n",
" embarked \n",
" class \n",
" who \n",
" adult_male \n",
" deck \n",
" embark_town \n",
" alive \n",
" alone \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" 0 \n",
" 3 \n",
" male \n",
" 22.0 \n",
" 1 \n",
" 0 \n",
" 7.2500 \n",
" S \n",
" Third \n",
" man \n",
" True \n",
" NaN \n",
" Southampton \n",
" no \n",
" False \n",
" \n",
" \n",
" 1 \n",
" 1 \n",
" 1 \n",
" female \n",
" 38.0 \n",
" 1 \n",
" 0 \n",
" 71.2833 \n",
" C \n",
" First \n",
" woman \n",
" False \n",
" C \n",
" Cherbourg \n",
" yes \n",
" False \n",
" \n",
" \n",
" 2 \n",
" 1 \n",
" 3 \n",
" female \n",
" 26.0 \n",
" 0 \n",
" 0 \n",
" 7.9250 \n",
" S \n",
" Third \n",
" woman \n",
" False \n",
" NaN \n",
" Southampton \n",
" yes \n",
" True \n",
" \n",
" \n",
" 3 \n",
" 1 \n",
" 1 \n",
" female \n",
" 35.0 \n",
" 1 \n",
" 0 \n",
" 53.1000 \n",
" S \n",
" First \n",
" woman \n",
" False \n",
" C \n",
" Southampton \n",
" yes \n",
" False \n",
" \n",
" \n",
" 4 \n",
" 0 \n",
" 3 \n",
" male \n",
" 35.0 \n",
" 0 \n",
" 0 \n",
" 8.0500 \n",
" S \n",
" Third \n",
" man \n",
" True \n",
" NaN \n",
" Southampton \n",
" no \n",
" True \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
" survived pclass sex age sibsp parch fare embarked class \\\n",
"0 0 3 male 22.0 1 0 7.2500 S Third \n",
"1 1 1 female 38.0 1 0 71.2833 C First \n",
"2 1 3 female 26.0 0 0 7.9250 S Third \n",
"3 1 1 female 35.0 1 0 53.1000 S First \n",
"4 0 3 male 35.0 0 0 8.0500 S Third \n",
"\n",
" who adult_male deck embark_town alive alone \n",
"0 man True NaN Southampton no False \n",
"1 woman False C Cherbourg yes False \n",
"2 woman False NaN Southampton yes True \n",
"3 woman False C Southampton yes False \n",
"4 man True NaN Southampton no True "
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = sns.load_dataset('titanic')\n",
"df_copy =df.copy() # keep a copy just in case\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'\\n1. overall EDA\\n2. further EDA on data variation and dup\\n3. further EDA on data missing or inf (inlier)\\n4. further EDA on data with outlier \\n5. further EDA on feature correlation\\n6. further EAD on feature selection\\n'"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"'''\n",
"1. overall EDA\n",
"2. further EDA on data variation and dup\n",
"3. further EDA on data missing or inf (inlier)\n",
"4. further EDA on data with outlier \n",
"5. further EDA on feature correlation\n",
"6. further EAD on feature selection\n",
"'''"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[1mGet list of column, data type and see if there are data missing\u001b[0m\n",
"\n",
"RangeIndex: 891 entries, 0 to 890\n",
"Data columns (total 15 columns):\n",
"survived 891 non-null int64\n",
"pclass 891 non-null int64\n",
"sex 891 non-null object\n",
"age 714 non-null float64\n",
"sibsp 891 non-null int64\n",
"parch 891 non-null int64\n",
"fare 891 non-null float64\n",
"embarked 889 non-null object\n",
"class 891 non-null category\n",
"who 891 non-null object\n",
"adult_male 891 non-null bool\n",
"deck 203 non-null category\n",
"embark_town 889 non-null object\n",
"alive 891 non-null object\n",
"alone 891 non-null bool\n",
"dtypes: bool(2), category(2), float64(2), int64(4), object(5)\n",
"memory usage: 80.6+ KB\n"
]
},
{
"data": {
"text/plain": [
"None"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[1mGet descriptive statistics for numeric column \u001b[0m\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" survived \n",
" pclass \n",
" age \n",
" sibsp \n",
" parch \n",
" fare \n",
" \n",
" \n",
" \n",
" \n",
" count \n",
" 891.000000 \n",
" 891.000000 \n",
" 714.000000 \n",
" 891.000000 \n",
" 891.000000 \n",
" 891.000000 \n",
" \n",
" \n",
" mean \n",
" 0.383838 \n",
" 2.308642 \n",
" 29.699118 \n",
" 0.523008 \n",
" 0.381594 \n",
" 32.204208 \n",
" \n",
" \n",
" std \n",
" 0.486592 \n",
" 0.836071 \n",
" 14.526497 \n",
" 1.102743 \n",
" 0.806057 \n",
" 49.693429 \n",
" \n",
" \n",
" min \n",
" 0.000000 \n",
" 1.000000 \n",
" 0.420000 \n",
" 0.000000 \n",
" 0.000000 \n",
" 0.000000 \n",
" \n",
" \n",
" 25% \n",
" 0.000000 \n",
" 2.000000 \n",
" 20.125000 \n",
" 0.000000 \n",
" 0.000000 \n",
" 7.910400 \n",
" \n",
" \n",
" 50% \n",
" 0.000000 \n",
" 3.000000 \n",
" 28.000000 \n",
" 0.000000 \n",
" 0.000000 \n",
" 14.454200 \n",
" \n",
" \n",
" 75% \n",
" 1.000000 \n",
" 3.000000 \n",
" 38.000000 \n",
" 1.000000 \n",
" 0.000000 \n",
" 31.000000 \n",
" \n",
" \n",
" max \n",
" 1.000000 \n",
" 3.000000 \n",
" 80.000000 \n",
" 8.000000 \n",
" 6.000000 \n",
" 512.329200 \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
" survived pclass age sibsp parch fare\n",
"count 891.000000 891.000000 714.000000 891.000000 891.000000 891.000000\n",
"mean 0.383838 2.308642 29.699118 0.523008 0.381594 32.204208\n",
"std 0.486592 0.836071 14.526497 1.102743 0.806057 49.693429\n",
"min 0.000000 1.000000 0.420000 0.000000 0.000000 0.000000\n",
"25% 0.000000 2.000000 20.125000 0.000000 0.000000 7.910400\n",
"50% 0.000000 3.000000 28.000000 0.000000 0.000000 14.454200\n",
"75% 1.000000 3.000000 38.000000 1.000000 0.000000 31.000000\n",
"max 1.000000 3.000000 80.000000 8.000000 6.000000 512.329200"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# 1. overall EDA\n",
"eda_overall(df)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" survived \n",
" pclass \n",
" sex \n",
" age \n",
" sibsp \n",
" parch \n",
" fare \n",
" embarked \n",
" class \n",
" who \n",
" adult_male \n",
" deck \n",
" embark_town \n",
" alive \n",
" alone \n",
" \n",
" \n",
" \n",
" \n",
" 413 \n",
" 0 \n",
" 2 \n",
" male \n",
" NaN \n",
" 0 \n",
" 0 \n",
" 0.0000 \n",
" S \n",
" Second \n",
" man \n",
" True \n",
" NaN \n",
" Southampton \n",
" no \n",
" True \n",
" \n",
" \n",
" 466 \n",
" 0 \n",
" 2 \n",
" male \n",
" NaN \n",
" 0 \n",
" 0 \n",
" 0.0000 \n",
" S \n",
" Second \n",
" man \n",
" True \n",
" NaN \n",
" Southampton \n",
" no \n",
" True \n",
" \n",
" \n",
" 674 \n",
" 0 \n",
" 2 \n",
" male \n",
" NaN \n",
" 0 \n",
" 0 \n",
" 0.0000 \n",
" S \n",
" Second \n",
" man \n",
" True \n",
" NaN \n",
" Southampton \n",
" no \n",
" True \n",
" \n",
" \n",
" 732 \n",
" 0 \n",
" 2 \n",
" male \n",
" NaN \n",
" 0 \n",
" 0 \n",
" 0.0000 \n",
" S \n",
" Second \n",
" man \n",
" True \n",
" NaN \n",
" Southampton \n",
" no \n",
" True \n",
" \n",
" \n",
" 481 \n",
" 0 \n",
" 2 \n",
" male \n",
" NaN \n",
" 0 \n",
" 0 \n",
" 0.0000 \n",
" S \n",
" Second \n",
" man \n",
" True \n",
" NaN \n",
" Southampton \n",
" no \n",
" True \n",
" \n",
" \n",
" 277 \n",
" 0 \n",
" 2 \n",
" male \n",
" NaN \n",
" 0 \n",
" 0 \n",
" 0.0000 \n",
" S \n",
" Second \n",
" man \n",
" True \n",
" NaN \n",
" Southampton \n",
" no \n",
" True \n",
" \n",
" \n",
" 784 \n",
" 0 \n",
" 3 \n",
" male \n",
" 25.00 \n",
" 0 \n",
" 0 \n",
" 7.0500 \n",
" S \n",
" Third \n",
" man \n",
" True \n",
" NaN \n",
" Southampton \n",
" no \n",
" True \n",
" \n",
" \n",
" 884 \n",
" 0 \n",
" 3 \n",
" male \n",
" 25.00 \n",
" 0 \n",
" 0 \n",
" 7.0500 \n",
" S \n",
" Third \n",
" man \n",
" True \n",
" NaN \n",
" Southampton \n",
" no \n",
" True \n",
" \n",
" \n",
" 773 \n",
" 0 \n",
" 3 \n",
" male \n",
" NaN \n",
" 0 \n",
" 0 \n",
" 7.2250 \n",
" C \n",
" Third \n",
" man \n",
" True \n",
" NaN \n",
" Cherbourg \n",
" no \n",
" True \n",
" \n",
" \n",
" 26 \n",
" 0 \n",
" 3 \n",
" male \n",
" NaN \n",
" 0 \n",
" 0 \n",
" 7.2250 \n",
" C \n",
" Third \n",
" man \n",
" True \n",
" NaN \n",
" Cherbourg \n",
" no \n",
" True \n",
" \n",
" \n",
" 522 \n",
" 0 \n",
" 3 \n",
" male \n",
" NaN \n",
" 0 \n",
" 0 \n",
" 7.2250 \n",
" C \n",
" Third \n",
" man \n",
" True \n",
" NaN \n",
" Cherbourg \n",
" no \n",
" True \n",
" \n",
" \n",
" 354 \n",
" 0 \n",
" 3 \n",
" male \n",
" NaN \n",
" 0 \n",
" 0 \n",
" 7.2250 \n",
" C \n",
" Third \n",
" man \n",
" True \n",
" NaN \n",
" Cherbourg \n",
" no \n",
" True \n",
" \n",
" \n",
" 598 \n",
" 0 \n",
" 3 \n",
" male \n",
" NaN \n",
" 0 \n",
" 0 \n",
" 7.2250 \n",
" C \n",
" Third \n",
" man \n",
" True \n",
" NaN \n",
" Cherbourg \n",
" no \n",
" True \n",
" \n",
" \n",
" 832 \n",
" 0 \n",
" 3 \n",
" male \n",
" NaN \n",
" 0 \n",
" 0 \n",
" 7.2292 \n",
" C \n",
" Third \n",
" man \n",
" True \n",
" NaN \n",
" Cherbourg \n",
" no \n",
" True \n",
" \n",
" \n",
" 568 \n",
" 0 \n",
" 3 \n",
" male \n",
" NaN \n",
" 0 \n",
" 0 \n",
" 7.2292 \n",
" C \n",
" Third \n",
" man \n",
" True \n",
" NaN \n",
" Cherbourg \n",
" no \n",
" True \n",
" \n",
" \n",
" 531 \n",
" 0 \n",
" 3 \n",
" male \n",
" NaN \n",
" 0 \n",
" 0 \n",
" 7.2292 \n",
" C \n",
" Third \n",
" man \n",
" True \n",
" NaN \n",
" Cherbourg \n",
" no \n",
" True \n",
" \n",
" \n",
" 859 \n",
" 0 \n",
" 3 \n",
" male \n",
" NaN \n",
" 0 \n",
" 0 \n",
" 7.2292 \n",
" C \n",
" Third \n",
" man \n",
" True \n",
" NaN \n",
" Cherbourg \n",
" no \n",
" True \n",
" \n",
" \n",
" 524 \n",
" 0 \n",
" 3 \n",
" male \n",
" NaN \n",
" 0 \n",
" 0 \n",
" 7.2292 \n",
" C \n",
" Third \n",
" man \n",
" True \n",
" NaN \n",
" Cherbourg \n",
" no \n",
" True \n",
" \n",
" \n",
" 320 \n",
" 0 \n",
" 3 \n",
" male \n",
" 22.00 \n",
" 0 \n",
" 0 \n",
" 7.2500 \n",
" S \n",
" Third \n",
" man \n",
" True \n",
" NaN \n",
" Southampton \n",
" no \n",
" True \n",
" \n",
" \n",
" 250 \n",
" 0 \n",
" 3 \n",
" male \n",
" NaN \n",
" 0 \n",
" 0 \n",
" 7.2500 \n",
" S \n",
" Third \n",
" man \n",
" True \n",
" NaN \n",
" Southampton \n",
" no \n",
" True \n",
" \n",
" \n",
" 470 \n",
" 0 \n",
" 3 \n",
" male \n",
" NaN \n",
" 0 \n",
" 0 \n",
" 7.2500 \n",
" S \n",
" Third \n",
" man \n",
" True \n",
" NaN \n",
" Southampton \n",
" no \n",
" True \n",
" \n",
" \n",
" 212 \n",
" 0 \n",
" 3 \n",
" male \n",
" 22.00 \n",
" 0 \n",
" 0 \n",
" 7.2500 \n",
" S \n",
" Third \n",
" man \n",
" True \n",
" NaN \n",
" Southampton \n",
" no \n",
" True \n",
" \n",
" \n",
" 425 \n",
" 0 \n",
" 3 \n",
" male \n",
" NaN \n",
" 0 \n",
" 0 \n",
" 7.2500 \n",
" S \n",
" Third \n",
" man \n",
" True \n",
" NaN \n",
" Southampton \n",
" no \n",
" True \n",
" \n",
" \n",
" 274 \n",
" 1 \n",
" 3 \n",
" female \n",
" NaN \n",
" 0 \n",
" 0 \n",
" 7.7500 \n",
" Q \n",
" Third \n",
" woman \n",
" False \n",
" NaN \n",
" Queenstown \n",
" yes \n",
" True \n",
" \n",
" \n",
" 260 \n",
" 0 \n",
" 3 \n",
" male \n",
" NaN \n",
" 0 \n",
" 0 \n",
" 7.7500 \n",
" Q \n",
" Third \n",
" man \n",
" True \n",
" NaN \n",
" Queenstown \n",
" no \n",
" True \n",
" \n",
" \n",
" 368 \n",
" 1 \n",
" 3 \n",
" female \n",
" NaN \n",
" 0 \n",
" 0 \n",
" 7.7500 \n",
" Q \n",
" Third \n",
" woman \n",
" False \n",
" NaN \n",
" Queenstown \n",
" yes \n",
" True \n",
" \n",
" \n",
" 198 \n",
" 1 \n",
" 3 \n",
" female \n",
" NaN \n",
" 0 \n",
" 0 \n",
" 7.7500 \n",
" Q \n",
" Third \n",
" woman \n",
" False \n",
" NaN \n",
" Queenstown \n",
" yes \n",
" True \n",
" \n",
" \n",
" 196 \n",
" 0 \n",
" 3 \n",
" male \n",
" NaN \n",
" 0 \n",
" 0 \n",
" 7.7500 \n",
" Q \n",
" Third \n",
" man \n",
" True \n",
" NaN \n",
" Queenstown \n",
" no \n",
" True \n",
" \n",
" \n",
" 790 \n",
" 0 \n",
" 3 \n",
" male \n",
" NaN \n",
" 0 \n",
" 0 \n",
" 7.7500 \n",
" Q \n",
" Third \n",
" man \n",
" True \n",
" NaN \n",
" Queenstown \n",
" no \n",
" True \n",
" \n",
" \n",
" 428 \n",
" 0 \n",
" 3 \n",
" male \n",
" NaN \n",
" 0 \n",
" 0 \n",
" 7.7500 \n",
" Q \n",
" Third \n",
" man \n",
" True \n",
" NaN \n",
" Queenstown \n",
" no \n",
" True \n",
" \n",
" \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" \n",
" \n",
" 364 \n",
" 0 \n",
" 3 \n",
" male \n",
" NaN \n",
" 1 \n",
" 0 \n",
" 15.5000 \n",
" Q \n",
" Third \n",
" man \n",
" True \n",
" NaN \n",
" Queenstown \n",
" no \n",
" False \n",
" \n",
" \n",
" 241 \n",
" 1 \n",
" 3 \n",
" female \n",
" NaN \n",
" 1 \n",
" 0 \n",
" 15.5000 \n",
" Q \n",
" Third \n",
" woman \n",
" False \n",
" NaN \n",
" Queenstown \n",
" yes \n",
" False \n",
" \n",
" \n",
" 347 \n",
" 1 \n",
" 3 \n",
" female \n",
" NaN \n",
" 1 \n",
" 0 \n",
" 16.1000 \n",
" S \n",
" Third \n",
" woman \n",
" False \n",
" NaN \n",
" Southampton \n",
" yes \n",
" False \n",
" \n",
" \n",
" 431 \n",
" 1 \n",
" 3 \n",
" female \n",
" NaN \n",
" 1 \n",
" 0 \n",
" 16.1000 \n",
" S \n",
" Third \n",
" woman \n",
" False \n",
" NaN \n",
" Southampton \n",
" yes \n",
" False \n",
" \n",
" \n",
" 469 \n",
" 1 \n",
" 3 \n",
" female \n",
" 0.75 \n",
" 2 \n",
" 1 \n",
" 19.2583 \n",
" C \n",
" Third \n",
" child \n",
" False \n",
" NaN \n",
" Cherbourg \n",
" yes \n",
" False \n",
" \n",
" \n",
" 644 \n",
" 1 \n",
" 3 \n",
" female \n",
" 0.75 \n",
" 2 \n",
" 1 \n",
" 19.2583 \n",
" C \n",
" Third \n",
" child \n",
" False \n",
" NaN \n",
" Cherbourg \n",
" yes \n",
" False \n",
" \n",
" \n",
" 451 \n",
" 0 \n",
" 3 \n",
" male \n",
" NaN \n",
" 1 \n",
" 0 \n",
" 19.9667 \n",
" S \n",
" Third \n",
" man \n",
" True \n",
" NaN \n",
" Southampton \n",
" no \n",
" False \n",
" \n",
" \n",
" 490 \n",
" 0 \n",
" 3 \n",
" male \n",
" NaN \n",
" 1 \n",
" 0 \n",
" 19.9667 \n",
" S \n",
" Third \n",
" man \n",
" True \n",
" NaN \n",
" Southampton \n",
" no \n",
" False \n",
" \n",
" \n",
" 405 \n",
" 0 \n",
" 2 \n",
" male \n",
" 34.00 \n",
" 1 \n",
" 0 \n",
" 21.0000 \n",
" S \n",
" Second \n",
" man \n",
" True \n",
" NaN \n",
" Southampton \n",
" no \n",
" False \n",
" \n",
" \n",
" 476 \n",
" 0 \n",
" 2 \n",
" male \n",
" 34.00 \n",
" 1 \n",
" 0 \n",
" 21.0000 \n",
" S \n",
" Second \n",
" man \n",
" True \n",
" NaN \n",
" Southampton \n",
" no \n",
" False \n",
" \n",
" \n",
" 409 \n",
" 0 \n",
" 3 \n",
" female \n",
" NaN \n",
" 3 \n",
" 1 \n",
" 25.4667 \n",
" S \n",
" Third \n",
" woman \n",
" False \n",
" NaN \n",
" Southampton \n",
" no \n",
" False \n",
" \n",
" \n",
" 229 \n",
" 0 \n",
" 3 \n",
" female \n",
" NaN \n",
" 3 \n",
" 1 \n",
" 25.4667 \n",
" S \n",
" Third \n",
" woman \n",
" False \n",
" NaN \n",
" Southampton \n",
" no \n",
" False \n",
" \n",
" \n",
" 485 \n",
" 0 \n",
" 3 \n",
" female \n",
" NaN \n",
" 3 \n",
" 1 \n",
" 25.4667 \n",
" S \n",
" Third \n",
" woman \n",
" False \n",
" NaN \n",
" Southampton \n",
" no \n",
" False \n",
" \n",
" \n",
" 133 \n",
" 1 \n",
" 2 \n",
" female \n",
" 29.00 \n",
" 1 \n",
" 0 \n",
" 26.0000 \n",
" S \n",
" Second \n",
" woman \n",
" False \n",
" NaN \n",
" Southampton \n",
" yes \n",
" False \n",
" \n",
" \n",
" 53 \n",
" 1 \n",
" 2 \n",
" female \n",
" 29.00 \n",
" 1 \n",
" 0 \n",
" 26.0000 \n",
" S \n",
" Second \n",
" woman \n",
" False \n",
" NaN \n",
" Southampton \n",
" yes \n",
" False \n",
" \n",
" \n",
" 64 \n",
" 0 \n",
" 1 \n",
" male \n",
" NaN \n",
" 0 \n",
" 0 \n",
" 27.7208 \n",
" C \n",
" First \n",
" man \n",
" True \n",
" NaN \n",
" Cherbourg \n",
" no \n",
" True \n",
" \n",
" \n",
" 295 \n",
" 0 \n",
" 1 \n",
" male \n",
" NaN \n",
" 0 \n",
" 0 \n",
" 27.7208 \n",
" C \n",
" First \n",
" man \n",
" True \n",
" NaN \n",
" Cherbourg \n",
" no \n",
" True \n",
" \n",
" \n",
" 74 \n",
" 1 \n",
" 3 \n",
" male \n",
" 32.00 \n",
" 0 \n",
" 0 \n",
" 56.4958 \n",
" S \n",
" Third \n",
" man \n",
" True \n",
" NaN \n",
" Southampton \n",
" yes \n",
" True \n",
" \n",
" \n",
" 838 \n",
" 1 \n",
" 3 \n",
" male \n",
" 32.00 \n",
" 0 \n",
" 0 \n",
" 56.4958 \n",
" S \n",
" Third \n",
" man \n",
" True \n",
" NaN \n",
" Southampton \n",
" yes \n",
" True \n",
" \n",
" \n",
" 692 \n",
" 1 \n",
" 3 \n",
" male \n",
" NaN \n",
" 0 \n",
" 0 \n",
" 56.4958 \n",
" S \n",
" Third \n",
" man \n",
" True \n",
" NaN \n",
" Southampton \n",
" yes \n",
" True \n",
" \n",
" \n",
" 643 \n",
" 1 \n",
" 3 \n",
" male \n",
" NaN \n",
" 0 \n",
" 0 \n",
" 56.4958 \n",
" S \n",
" Third \n",
" man \n",
" True \n",
" NaN \n",
" Southampton \n",
" yes \n",
" True \n",
" \n",
" \n",
" 641 \n",
" 1 \n",
" 1 \n",
" female \n",
" 24.00 \n",
" 0 \n",
" 0 \n",
" 69.3000 \n",
" C \n",
" First \n",
" woman \n",
" False \n",
" B \n",
" Cherbourg \n",
" yes \n",
" True \n",
" \n",
" \n",
" 369 \n",
" 1 \n",
" 1 \n",
" female \n",
" 24.00 \n",
" 0 \n",
" 0 \n",
" 69.3000 \n",
" C \n",
" First \n",
" woman \n",
" False \n",
" B \n",
" Cherbourg \n",
" yes \n",
" True \n",
" \n",
" \n",
" 846 \n",
" 0 \n",
" 3 \n",
" male \n",
" NaN \n",
" 8 \n",
" 2 \n",
" 69.5500 \n",
" S \n",
" Third \n",
" man \n",
" True \n",
" NaN \n",
" Southampton \n",
" no \n",
" False \n",
" \n",
" \n",
" 792 \n",
" 0 \n",
" 3 \n",
" female \n",
" NaN \n",
" 8 \n",
" 2 \n",
" 69.5500 \n",
" S \n",
" Third \n",
" woman \n",
" False \n",
" NaN \n",
" Southampton \n",
" no \n",
" False \n",
" \n",
" \n",
" 863 \n",
" 0 \n",
" 3 \n",
" female \n",
" NaN \n",
" 8 \n",
" 2 \n",
" 69.5500 \n",
" S \n",
" Third \n",
" woman \n",
" False \n",
" NaN \n",
" Southampton \n",
" no \n",
" False \n",
" \n",
" \n",
" 201 \n",
" 0 \n",
" 3 \n",
" male \n",
" NaN \n",
" 8 \n",
" 2 \n",
" 69.5500 \n",
" S \n",
" Third \n",
" man \n",
" True \n",
" NaN \n",
" Southampton \n",
" no \n",
" False \n",
" \n",
" \n",
" 324 \n",
" 0 \n",
" 3 \n",
" male \n",
" NaN \n",
" 8 \n",
" 2 \n",
" 69.5500 \n",
" S \n",
" Third \n",
" man \n",
" True \n",
" NaN \n",
" Southampton \n",
" no \n",
" False \n",
" \n",
" \n",
" 159 \n",
" 0 \n",
" 3 \n",
" male \n",
" NaN \n",
" 8 \n",
" 2 \n",
" 69.5500 \n",
" S \n",
" Third \n",
" man \n",
" True \n",
" NaN \n",
" Southampton \n",
" no \n",
" False \n",
" \n",
" \n",
" 180 \n",
" 0 \n",
" 3 \n",
" female \n",
" NaN \n",
" 8 \n",
" 2 \n",
" 69.5500 \n",
" S \n",
" Third \n",
" woman \n",
" False \n",
" NaN \n",
" Southampton \n",
" no \n",
" False \n",
" \n",
" \n",
"
\n",
"
160 rows × 15 columns
\n",
"
"
],
"text/plain": [
" survived pclass sex age sibsp parch fare embarked class \\\n",
"413 0 2 male NaN 0 0 0.0000 S Second \n",
"466 0 2 male NaN 0 0 0.0000 S Second \n",
"674 0 2 male NaN 0 0 0.0000 S Second \n",
"732 0 2 male NaN 0 0 0.0000 S Second \n",
"481 0 2 male NaN 0 0 0.0000 S Second \n",
"277 0 2 male NaN 0 0 0.0000 S Second \n",
"784 0 3 male 25.00 0 0 7.0500 S Third \n",
"884 0 3 male 25.00 0 0 7.0500 S Third \n",
"773 0 3 male NaN 0 0 7.2250 C Third \n",
"26 0 3 male NaN 0 0 7.2250 C Third \n",
"522 0 3 male NaN 0 0 7.2250 C Third \n",
"354 0 3 male NaN 0 0 7.2250 C Third \n",
"598 0 3 male NaN 0 0 7.2250 C Third \n",
"832 0 3 male NaN 0 0 7.2292 C Third \n",
"568 0 3 male NaN 0 0 7.2292 C Third \n",
"531 0 3 male NaN 0 0 7.2292 C Third \n",
"859 0 3 male NaN 0 0 7.2292 C Third \n",
"524 0 3 male NaN 0 0 7.2292 C Third \n",
"320 0 3 male 22.00 0 0 7.2500 S Third \n",
"250 0 3 male NaN 0 0 7.2500 S Third \n",
"470 0 3 male NaN 0 0 7.2500 S Third \n",
"212 0 3 male 22.00 0 0 7.2500 S Third \n",
"425 0 3 male NaN 0 0 7.2500 S Third \n",
"274 1 3 female NaN 0 0 7.7500 Q Third \n",
"260 0 3 male NaN 0 0 7.7500 Q Third \n",
"368 1 3 female NaN 0 0 7.7500 Q Third \n",
"198 1 3 female NaN 0 0 7.7500 Q Third \n",
"196 0 3 male NaN 0 0 7.7500 Q Third \n",
"790 0 3 male NaN 0 0 7.7500 Q Third \n",
"428 0 3 male NaN 0 0 7.7500 Q Third \n",
".. ... ... ... ... ... ... ... ... ... \n",
"364 0 3 male NaN 1 0 15.5000 Q Third \n",
"241 1 3 female NaN 1 0 15.5000 Q Third \n",
"347 1 3 female NaN 1 0 16.1000 S Third \n",
"431 1 3 female NaN 1 0 16.1000 S Third \n",
"469 1 3 female 0.75 2 1 19.2583 C Third \n",
"644 1 3 female 0.75 2 1 19.2583 C Third \n",
"451 0 3 male NaN 1 0 19.9667 S Third \n",
"490 0 3 male NaN 1 0 19.9667 S Third \n",
"405 0 2 male 34.00 1 0 21.0000 S Second \n",
"476 0 2 male 34.00 1 0 21.0000 S Second \n",
"409 0 3 female NaN 3 1 25.4667 S Third \n",
"229 0 3 female NaN 3 1 25.4667 S Third \n",
"485 0 3 female NaN 3 1 25.4667 S Third \n",
"133 1 2 female 29.00 1 0 26.0000 S Second \n",
"53 1 2 female 29.00 1 0 26.0000 S Second \n",
"64 0 1 male NaN 0 0 27.7208 C First \n",
"295 0 1 male NaN 0 0 27.7208 C First \n",
"74 1 3 male 32.00 0 0 56.4958 S Third \n",
"838 1 3 male 32.00 0 0 56.4958 S Third \n",
"692 1 3 male NaN 0 0 56.4958 S Third \n",
"643 1 3 male NaN 0 0 56.4958 S Third \n",
"641 1 1 female 24.00 0 0 69.3000 C First \n",
"369 1 1 female 24.00 0 0 69.3000 C First \n",
"846 0 3 male NaN 8 2 69.5500 S Third \n",
"792 0 3 female NaN 8 2 69.5500 S Third \n",
"863 0 3 female NaN 8 2 69.5500 S Third \n",
"201 0 3 male NaN 8 2 69.5500 S Third \n",
"324 0 3 male NaN 8 2 69.5500 S Third \n",
"159 0 3 male NaN 8 2 69.5500 S Third \n",
"180 0 3 female NaN 8 2 69.5500 S Third \n",
"\n",
" who adult_male deck embark_town alive alone \n",
"413 man True NaN Southampton no True \n",
"466 man True NaN Southampton no True \n",
"674 man True NaN Southampton no True \n",
"732 man True NaN Southampton no True \n",
"481 man True NaN Southampton no True \n",
"277 man True NaN Southampton no True \n",
"784 man True NaN Southampton no True \n",
"884 man True NaN Southampton no True \n",
"773 man True NaN Cherbourg no True \n",
"26 man True NaN Cherbourg no True \n",
"522 man True NaN Cherbourg no True \n",
"354 man True NaN Cherbourg no True \n",
"598 man True NaN Cherbourg no True \n",
"832 man True NaN Cherbourg no True \n",
"568 man True NaN Cherbourg no True \n",
"531 man True NaN Cherbourg no True \n",
"859 man True NaN Cherbourg no True \n",
"524 man True NaN Cherbourg no True \n",
"320 man True NaN Southampton no True \n",
"250 man True NaN Southampton no True \n",
"470 man True NaN Southampton no True \n",
"212 man True NaN Southampton no True \n",
"425 man True NaN Southampton no True \n",
"274 woman False NaN Queenstown yes True \n",
"260 man True NaN Queenstown no True \n",
"368 woman False NaN Queenstown yes True \n",
"198 woman False NaN Queenstown yes True \n",
"196 man True NaN Queenstown no True \n",
"790 man True NaN Queenstown no True \n",
"428 man True NaN Queenstown no True \n",
".. ... ... ... ... ... ... \n",
"364 man True NaN Queenstown no False \n",
"241 woman False NaN Queenstown yes False \n",
"347 woman False NaN Southampton yes False \n",
"431 woman False NaN Southampton yes False \n",
"469 child False NaN Cherbourg yes False \n",
"644 child False NaN Cherbourg yes False \n",
"451 man True NaN Southampton no False \n",
"490 man True NaN Southampton no False \n",
"405 man True NaN Southampton no False \n",
"476 man True NaN Southampton no False \n",
"409 woman False NaN Southampton no False \n",
"229 woman False NaN Southampton no False \n",
"485 woman False NaN Southampton no False \n",
"133 woman False NaN Southampton yes False \n",
"53 woman False NaN Southampton yes False \n",
"64 man True NaN Cherbourg no True \n",
"295 man True NaN Cherbourg no True \n",
"74 man True NaN Southampton yes True \n",
"838 man True NaN Southampton yes True \n",
"692 man True NaN Southampton yes True \n",
"643 man True NaN Southampton yes True \n",
"641 woman False B Cherbourg yes True \n",
"369 woman False B Cherbourg yes True \n",
"846 man True NaN Southampton no False \n",
"792 woman False NaN Southampton no False \n",
"863 woman False NaN Southampton no False \n",
"201 man True NaN Southampton no False \n",
"324 man True NaN Southampton no False \n",
"159 man True NaN Southampton no False \n",
"180 woman False NaN Southampton no False \n",
"\n",
"[160 rows x 15 columns]"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 2. further EDA on data variation and duplicates\n",
"eda_showDup(df, 'fare')\n",
"\n",
"#df1 =df.drop_duplicates()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" column \n",
" count \n",
" unique/total \n",
" \n",
" \n",
" \n",
" \n",
" 6 \n",
" fare \n",
" 248 \n",
" 16.533333 \n",
" \n",
" \n",
" 3 \n",
" age \n",
" 88 \n",
" 5.866667 \n",
" \n",
" \n",
" 4 \n",
" sibsp \n",
" 7 \n",
" 0.466667 \n",
" \n",
" \n",
" 5 \n",
" parch \n",
" 7 \n",
" 0.466667 \n",
" \n",
" \n",
" 11 \n",
" deck \n",
" 7 \n",
" 0.466667 \n",
" \n",
" \n",
" 1 \n",
" pclass \n",
" 3 \n",
" 0.200000 \n",
" \n",
" \n",
" 7 \n",
" embarked \n",
" 3 \n",
" 0.200000 \n",
" \n",
" \n",
" 8 \n",
" class \n",
" 3 \n",
" 0.200000 \n",
" \n",
" \n",
" 9 \n",
" who \n",
" 3 \n",
" 0.200000 \n",
" \n",
" \n",
" 12 \n",
" embark_town \n",
" 3 \n",
" 0.200000 \n",
" \n",
" \n",
" 0 \n",
" survived \n",
" 2 \n",
" 0.133333 \n",
" \n",
" \n",
" 2 \n",
" sex \n",
" 2 \n",
" 0.133333 \n",
" \n",
" \n",
" 10 \n",
" adult_male \n",
" 2 \n",
" 0.133333 \n",
" \n",
" \n",
" 13 \n",
" alive \n",
" 2 \n",
" 0.133333 \n",
" \n",
" \n",
" 14 \n",
" alone \n",
" 2 \n",
" 0.133333 \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
" column count unique/total\n",
"6 fare 248 16.533333\n",
"3 age 88 5.866667\n",
"4 sibsp 7 0.466667\n",
"5 parch 7 0.466667\n",
"11 deck 7 0.466667\n",
"1 pclass 3 0.200000\n",
"7 embarked 3 0.200000\n",
"8 class 3 0.200000\n",
"9 who 3 0.200000\n",
"12 embark_town 3 0.200000\n",
"0 survived 2 0.133333\n",
"2 sex 2 0.133333\n",
"10 adult_male 2 0.133333\n",
"13 alive 2 0.133333\n",
"14 alone 2 0.133333"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#check data variation\n",
"eda_feature_variance_ratio (df)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" Total \n",
" Percent \n",
" \n",
" \n",
" \n",
" \n",
" deck \n",
" 688 \n",
" 0.772166 \n",
" \n",
" \n",
" age \n",
" 177 \n",
" 0.198653 \n",
" \n",
" \n",
" embark_town \n",
" 2 \n",
" 0.002245 \n",
" \n",
" \n",
" embarked \n",
" 2 \n",
" 0.002245 \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Total Percent\n",
"deck 688 0.772166\n",
"age 177 0.198653\n",
"embark_town 2 0.002245\n",
"embarked 2 0.002245"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#3 further EDA on data missing or inf (inlier)\n",
"eda_getMissingData(df)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" Inf count \n",
" Inf Ratio \n",
" \n",
" \n",
" \n",
" \n",
" survived \n",
" 0 \n",
" 0.0 \n",
" \n",
" \n",
" pclass \n",
" 0 \n",
" 0.0 \n",
" \n",
" \n",
" age \n",
" 0 \n",
" 0.0 \n",
" \n",
" \n",
" sibsp \n",
" 0 \n",
" 0.0 \n",
" \n",
" \n",
" parch \n",
" 0 \n",
" 0.0 \n",
" \n",
" \n",
" fare \n",
" 0 \n",
" 0.0 \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Inf count Inf Ratio\n",
"survived 0 0.0\n",
"pclass 0 0.0\n",
"age 0 0.0\n",
"sibsp 0 0.0\n",
"parch 0 0.0\n",
"fare 0 0.0"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"eda_getInfData(df) "
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" company \n",
" revnue \n",
" profit \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" A \n",
" 1000.0 \n",
" -inf \n",
" \n",
" \n",
" 1 \n",
" B \n",
" inf \n",
" 200.0 \n",
" \n",
" \n",
" 2 \n",
" C \n",
" 4000.0 \n",
" 1000.0 \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
" company revnue profit\n",
"0 A 1000.0 -inf\n",
"1 B inf 200.0\n",
"2 C 4000.0 1000.0"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# let us intentionally create some inf values to check that the function works properly\n",
"company = ['A', 'B', 'C']\n",
"revnue = [1000, np.inf, 4000]\n",
"profit = [-np.inf, 200, 1000]\n",
"df_test = pd.DataFrame({'company':company, 'revnue':revnue, 'profit':profit})\n",
"df_test\n"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" Inf count \n",
" Inf Ratio \n",
" \n",
" \n",
" \n",
" \n",
" revnue \n",
" 1 \n",
" 0.333333 \n",
" \n",
" \n",
" profit \n",
" 1 \n",
" 0.333333 \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Inf count Inf Ratio\n",
"revnue 1 0.333333\n",
"profit 1 0.333333"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"eda_getInfData(df_test)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"outlier standard is out of 1.5 standard deviation \n",
"\n",
"['pclass', 'age', 'sibsp', 'parch', 'fare']\n"
]
}
],
"source": [
"# 4. outlier analysis. Please note: the result contains some discrete variables, so interpret it with business sense\n",
"eda_getOutLier(df, 1.5)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
""
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"# take a look at age; pclass, sibsp and parch are discrete variables\n",
"eda_getHistPlot(df,'age')\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
""
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
},
{
"data": {
"image/png": "\n",
"text/plain": [
""
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"#bulk plot with age and fare\n",
"eda_getBulkPlot(df, ['age', 'fare'], eda_getHistPlot )"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAYIAAADuCAYAAAA0uwAcAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAEIxJREFUeJzt3X1sVfd9x/H3FygLkGYJxEEUSpzKKJ3UjnTxsiaVqjWEjXZTYVI7pepab2JCkzbi7rFsyh+JVE3pNC3N0FYJNdlcqWuTZe1AJSMlLFk1qcpqGro8QIqbleBAwXUe2pU+zPDdHz60kBh8fc25x+b3fknWuefcczgfRZCPf+fce36RmUiSyjWn6QCSpGZZBJJUOItAkgpnEUhS4SwCSSqcRSBJhbMIJKlwFoEkFc4ikKTCzWs6QCuuvPLK7O7ubjqGJM0qe/fu/U5mdk2236wogu7ubgYHB5uOIUmzSkQcamU/Lw1JUuEsAkkqnEUgSYWzCCSpcBaB1KbR0VFuu+02RkdHm44iTUutRRARfxgRT0fEUxHx2Yi4JCKuiYjHI+JgRNwfEfPrzCDVZWBggCeffJJPf/rTTUeRpqW2IoiI5cBtQG9mvgWYC9wKfBy4OzNXAS8BG+vKINVldHSUXbt2kZns2rXLUYFmtbovDc0DFkTEPGAhcBS4GXiwen8A2FBzBumCGxgY4NSpUwCcPHnSUYFmtdqKIDNfAP4aeJ7xAngF2Au8nJlj1W7DwPKJjo+ITRExGBGDIyMjdcWU2vLII48wNjb+13hsbIzdu3c3nEhqX52Xhq4A1gPXAG8AFgHvnmDXnOj4zNyWmb2Z2dvVNek3pKWOuuWWW5g3b/yL+fPmzWPt2rUNJ5LaV+eloVuA/8nMkcz8P+DzwE3A5dWlIoAVwJEaM0i16OvrY86c8X8+c+fO5cMf/nDDiaT21VkEzwNvj4iFERHAGuAZ4FHgfdU+fcD2GjNItViyZAnr1q0jIli3bh1LlixpOpLUtjrvETzO+E3hrwFPVufaBnwU+KOIGAKWAPfWlUGqU19fH29961sdDWjWi8wJL9HPKL29venTRyVpaiJib2b2Traf3yyWpMJZBJJUOItAkgpnEUhS4SwCSSqcRSC1ycdQ62JhEUht8jHUulhYBFIbfAy1LiYWgdQGH0Oti4lFILXBx1DrYmIRSG3wMdS6mFgEUht8DLUuJhaB1IYlS5Zw0003AXDjjTf6GGrNahaB1KZnn30WgG984xsNJ5GmxyKQ2jA0NMTRo0cBOHLkCENDQw0nktpX55zF10bEvjN+vhsRH4mIxRGxOyIOVssr6sog1eWOO+44a/3OO+9sJoh0AdQ5Q9mzmXldZl4HXA+cAL4AbAH2ZOYqYE+1Ls0qw8PDZ60fPny4oSTS9HXq0tAa4JuZeQhYDwxU2weADR3KIEmaQKeK4Fbgs9XrpZl5FKBaXtWhDNIFs3DhwvOuS7NJ7UUQEfOB9wL/PMXjNkXEYEQMjoyM1BNOatOJEyfOuy7NJp0YEbwb+FpmHqvWj0XEMoBqeXyigzJzW2b2ZmZvV1dXB2JKrevu7j7vujSbdKIIPsBPLwsB7AD6qtd9wPYOZJAuqNtvv/2869JsUmsRRMRCYC3w+TM23wWsjYiD1Xt31ZlBqkNPT89PRgHd3d309PQ0G0iahlqLIDNPZOaSzHzljG2jmbkmM1dVyxfrzCDV5fbbb2fRokWOBjTrzWs6gDRb9fT0sHPnzqZjSNPmIyakNjlnsS4WFoHUJucs1sXCIpDaMDo6ys6dO8lMdu7c6ahAs5pFILVhYGCAkydPAuNTVToq0GxmEUhtePjhh89a37VrV0NJpOmzCKQ2nB4NnGtdmk0sAqkNY2Nj512XZhO/R6Ap2bp1q7NxAXPmzOHUqVNnrff39zeYqHk9PT1s3ry56RhqgyMCqQ1XX331Wes+dE6zmSMCTYm/8f3UzT
ffzKlTp7j00ku57777mo4jtc0RgdSm06MC5yvWbGcRSG267LLLWL16Nddff33TUaRpsQgkqXAWgSQVziKQpMLVPUPZ5RHxYEQciIj9EXFjRCyOiN0RcbBaXlFnBknS+dU9IrgH2JWZbwZWA/uBLcCezFwF7KnWJUkNqa0IIuIy4J3AvQCZ+ePMfBlYDwxUuw0AG+rKIEmaXJ0jgjcBI8A/RMQTEfGpiFgELM3MowDV8qqJDo6ITRExGBGDIyMjNcaUpLLVWQTzgF8APpmZbwO+zxQuA2Xmtszszczerq6uujJKUvHqLIJhYDgzH6/WH2S8GI5FxDKAanm8xgySpEnUVgSZ+W3gcERcW21aAzwD7AD6qm19wPa6MkiSJlf3Q+c2A5+JiPnAc8DvMF4+D0TERuB54P01Z5AknUetRZCZ+4DeCd5aU+d5JUmt85vFklQ4i0CSCmcRSFLhLAJJKpxFIEmFswgkqXAWgSQVziKQpMJZBJJUOItAkgpnEUhS4SwCSSqcRSBJhbMIJKlwFoEkFc4ikKTC1ToxTUR8C/gecBIYy8zeiFgM3A90A98CfjMzX6ozhyTp3DoxInhXZl6XmadnKtsC7MnMVcCeal2S1JAmLg2tBwaq1wPAhgYySJIqdRdBAl+KiL0RsanatjQzjwJUy6smOjAiNkXEYEQMjoyM1BxTkspV6z0C4B2ZeSQirgJ2R8SBVg/MzG3ANoDe3t6sK6Akla7WEUFmHqmWx4EvADcAxyJiGUC1PF5nBknS+dVWBBGxKCJef/o18CvAU8AOoK/arQ/YXlcGSdLk6rw0tBT4QkScPs8/ZeauiPgq8EBEbASeB95fYwZJ0iRqK4LMfA5YPcH2UWBNXeeVJE2N3yyWpMJZBJJUOItAkgpnEUhS4SwCSSqcRSBJhbMIJKlwFoEkFc4ikKTCWQSSVDiLQJIKZxFIUuGmVATV46QlSReRloogIm6KiGeA/dX66oj4+1qTSZI6otURwd3ArwKjAJn5deCddYWSJHVOy5eGMvPwqzadvMBZJEkNaLUIDkfETUBGxPyI+BOqy0STiYi5EfFERHyxWr8mIh6PiIMRcX9EzG8zuyTpAmi1CH4P+H1gOTAMXFett6Kfs0vj48DdmbkKeAnY2OKfI0mqQUtFkJnfycwPZubSzLwqM3+rmnLyvCJiBfBrwKeq9QBuBh6sdhkANrQXXZJ0IbQ0Z3FE/O0Em18BBjNz+3kO/QTwZ8Drq/UlwMuZOVatDzM+ypjonJuATQArV65sJaYkqQ2tXhq6hPHLQQern58HFgMbI+ITEx0QEb8OHM/MvWdunmDXnOj4zNyWmb2Z2dvV1dViTEnSVLU0IgB6gJtP/yYfEZ8EvgSsBZ48xzHvAN4bEe9hvEguY3yEcHlEzKv+rBXAkWnklyRNU6sjguXAmd8qXgS8ITNPAj+a6IDM/PPMXJGZ3cCtwL9n5geBR4H3Vbv1Aee7tCRJqlmrI4K/AvZFxGOMX955J/CX1SMnHpniOT8KfC4iPgY8Adw7xeM7buvWrQwNDTUdQzPM6b8T/f39DSfRTNPT08PmzZubjtGyloogM++NiH8DPgQcYPyy0HBmfh/40xaOfwx4rHr9HHBDm3kbMTQ0xL6n9nNy4eKmo2gGmfPj8dtbe5871nASzSRzT7zYdIQpa/VTQ7/L+PcBVgD7gLcDX2H8o6BFOLlwMT9483uajiFphltw4KGmI0xZq/cI+oFfBA5l5ruAtwEjtaWSJHVMq0Xww8z8IUBE/ExmHgCurS+WJKlTWr1ZPBwRlwP/CuyOiJfwY5+SdFFo9Wbxb1Qv74iIR4GfBXbVlkqS1DGtjgh+IjP/o44gkqRmOGexJBXOIpCkwlkEklQ4i0CSCmcRSFLhLAJJKpxFIEmFswgkqXAWgSQVziKQpMLVVgQRcUlE/FdEfD0ino6IO6vt10TE4xFxMCLuj4j5dWWQJE2uzhHBjxif8H41cB2wLiLeDnwcuDszVw
EvARtrzCBJmkRtRZDj/rdafV31k4zPavZgtX0A2FBXBknS5Gq9RxARcyNiH3Ac2A18E3g5M8eqXYaB5ec4dlNEDEbE4MiIk6FJUl1qLYLMPJmZ1zE+1/ENwM9NtNs5jt2Wmb2Z2dvV1VVnTEkqWkc+NZSZLwOPMT7p/eURcXoehBU405kkNarOTw11VdNbEhELgFuA/cCjwPuq3fqA7XVlkCRNbsozlE3BMmAgIuYyXjgPZOYXI+IZ4HMR8THgCeDeGjNcEC+88AJzT7zCggMPNR1F0gw398QoL7wwNvmOM0htRZCZ/w28bYLtzzF+v0CSNAPUOSK4aCxfvpxv/2geP3jze5qOImmGW3DgIZYvX9p0jCnxEROSVDiLQJIKZxFIUuEsAkkqnEUgSYWzCCSpcBaBJBXOIpCkwlkEklQ4i0CSCmcRSFLhLAJJKpxFIEmFswgkqXAWgSQVrs6pKt8YEY9GxP6IeDoi+qvtiyNid0QcrJZX1JVBkjS5OiemGQP+ODO/FhGvB/ZGxG7gt4E9mXlXRGwBtgAfrTHHBTH3xItOVamzzPnhdwE4dcllDSfRTDL3xIvA7JqYps6pKo8CR6vX34uI/cByYD3wy9VuA8BjzPAi6OnpaTqCZqChoe8B0POm2fWPXnVbOuv+nxGZWf9JIrqBLwNvAZ7PzMvPeO+lzHzN5aGI2ARsAli5cuX1hw4dqj2nNBX9/f0A3HPPPQ0nkSYWEXszs3ey/Wq/WRwRlwL/AnwkM7/b6nGZuS0zezOzt6urq76AklS4WosgIl7HeAl8JjM/X20+FhHLqveXAcfrzCBJOr86PzUUwL3A/sz8mzPe2gH0Va/7gO11ZZAkTa7OTw29A/gQ8GRE7Ku2/QVwF/BARGwEngfeX2MGSdIk6vzU0H8CcY6319R1XknS1PjNYkkqnEUgSYWzCCSpcBaBJBXOIpCkwlkEklQ4i0CSCmcRSFLhLAJJKpxFIEmFswgkqXAWgSQVziKQpMJZBJJUOItAkgpX5wxl90XE8Yh46oxtiyNid0QcrJavmbRektRZdY4I/hFY96ptW4A9mbkK2FOtS5IaVFsRZOaXgRdftXk9MFC9HgA21HV+SVJrOn2PYGlmHgWolld1+PySpFeZsTeLI2JTRAxGxODIyEjTcSTpotXpIjgWEcsAquXxc+2Ymdsyszcze7u6ujoWUJJK0+ki2AH0Va/7gO0dPr8k6VXq/PjoZ4GvANdGxHBEbATuAtZGxEFgbbUuSWrQvLr+4Mz8wDneWlPXOSVJUzdjbxZLkjrDIpCkwlkEklQ4i0CSCmcRSFLhLAJJKpxFIEmFswgkqXAWgSQVziKQpMJZBJJUOItAkgpnEUhS4SwCSSqcRSBJhbMIJKlwjRRBRKyLiGcjYigitjSRQZI0rrYZys4lIuYCf8f4VJXDwFcjYkdmPtPpLJq6rVu3MjQ01HSMGeH0f4f+/v6Gk8wMPT09bN68uekYakMTI4IbgKHMfC4zfwx8DljfQA5pWhYsWMCCBQuajiFNW8dHBMBy4PAZ68PAL716p4jYBGwCWLlyZWeSaVL+xiddfJoYEcQE2/I1GzK3ZWZvZvZ2dXV1IJYklamJIhgG3njG+grgSAM5JEk0UwRfBVZFxDURMR+4FdjRQA5JEg3cI8jMsYj4A+BhYC5wX2Y+3ekckqRxTdwsJjMfAh5q4tySpLP5zWJJKpxFIEmFswgkqXCR+ZqP8M84ETECHGo6hzSBK4HvNB1COoerM3PSL2LNiiKQZqqIGMzM3qZzSNPhpSFJKpxFIEmFswik6dnWdABpurxHIEmFc0QgSYWzCCSpcBaBJBXOIpCkwlkEklS4/weEsXPoDDFljwAAAABJRU5ErkJggg==\n",
"text/plain": [
""
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAYgAAADuCAYAAAAjmZDVAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAEE9JREFUeJzt3X+MXWWdx/H3tx3UVlwqMhDSooNOs2pi/MGEkLh/wAAbKBsLURI2ZG1Il8YNumw0WREQSlJdTciyhSBsAaG47CJx19C4RBdazGb/0HW6svgDs9wlVYYSGAVB6BYz8N0/5kyZaZ/OTHvvmXMPfb+Sm3u+z3lm+v2j6afPOec+NzITSZL2t6TpBiRJ/cmAkCQVGRCSpCIDQpJUZEBIkooMCElSkQEhSSoyICRJRQaEJKlooOkGunHcccfl0NBQ021IUqvs3Lnz15k5ON+8VgfE0NAQY2NjTbchSa0SEb9cyDwvMUmSigwISVKRASFJKjIgJElFBoTUY2NjY4yOjrJz586mW5G6YkBIPbZx40Zee+01rr322qZbkbpiQEg9NDY2xksvvQTASy+95CpCrWZASD20cePGWbWrCLWZASH10PTq4WC11Ca1BkRE7IqIn0TEIxExVo0dGxEPRsTj1fvbq/GIiBsjohMRj0bER+rsTarD0UcfPWcttclirCDOyMwPZeZIVV8BbM/M1cD2qgY4F1hdvTYAtyxCb1JP7X+J6brrrmumEakHmrjEtBbYWh1vBc6fMX53TvkBsCIiTmygP+mwjYyM7Fs1HH300ZxyyikNdyQdvroDIoF/i4idEbGhGjshM58GqN6Pr8ZXAk/O+NnxakxqlY0bN7JkyRJXD2q9undz/Whm7o6I44EHI+IXc8yNwlgeMGkqaDYAvPOd7+xNl1IPjYyMsGPHjqbbkLpW6woiM3dX788C3wZOBZ6ZvnRUvT9bTR8HTprx46uA3YXfuSUzRzJzZHBw3u3MJUmHqbaAiIi3RsTbpo+BPwZ+CmwD1lXT1gH3V8fbgE9WTzOdBrwwfSlKkrT46rzEdALw7YiY/nP+MTO/GxE/Au6LiPXAr4ALq/kPAGuADrAHuKTG3iRJ86gtIDLzCeCDhfHfAGcWxhO4rK5+JEmHxk9SS5KKDAhJUpEBIUkqMiAkSUUGhCSpyICQJBUZEJKkIgNCklRkQEiSigwISVKRASFJKjIgJElFBoQkqciAkCQVGRCSpCIDQpJUZEBIkooMCElSkQEhSSoyICRJRQaEJKnIgJAkFRkQkqQiA0KSVGRASJKKDAhJUpEBIUkqMiAkSUUGhCSpyICQJBXVHhARsTQifhwR36nqkyPihxHxeER8MyLeVI2/uao71fmhunuTJB3cYqwgLgcem1F/FbghM1cDzwPrq/H1wPOZOQzcUM2TJDWk1oCIiFXAecDtVR3AKPCtaspW4PzqeG1VU50/s5ovSWpA3SuIvwP+Gnitqt8B/DYzJ6t6HFhZHa8EngSozr9QzZ8lIjZExFhEjE1MTNTZuyQd0WoLiIj4E+DZzNw5c7gwNRdw7vWBzC2ZOZKZI4ODgz3oVJJUMlDj7/4o8LGIWAO8BfgDplYUKyJioFolrAJ2V/PHgZOA8YgYAI4BnquxP0nSHGpbQWTmFzJzVWYOARcBOzLzYuBh4BPVtHXA/dXxtqqmOr8jMw9YQUiSFkcTn4P4PPDZiOgwdY/hjmr8DuAd1fhngSsa6E2SVKnzEtM+mfl94PvV8RPAqYU5e4ELF6MfSdL8/CS1JKnIgJAkFRkQkqQiA0KSVGRASJKKDAhJUpEBIUkqMiAkSUUGhCSpyICQJBUZEJKkIgNCklRkQEiSigwISVKRASFJKjIgJElFBoQkqciAkCQVGRCSpCIDQpJUZEBIkooMCElSkQEhSSoyICRJRQaEJKnIgJAkFRkQkqQiA0KSVGRASJKKDAhJUlFtARERb4mI/4
yI/46In0XEddX4yRHxw4h4PCK+GRFvqsbfXNWd6vxQXb1JkuZX5wriFWA0Mz8IfAg4JyJOA74K3JCZq4HngfXV/PXA85k5DNxQzZMkNaS2gMgpL1XlUdUrgVHgW9X4VuD86nhtVVOdPzMioq7+JElzq/UeREQsjYhHgGeBB4H/BX6bmZPVlHFgZXW8EngSoDr/AvCOwu/cEBFjETE2MTFRZ/uSdESrNSAy89XM/BCwCjgVeF9pWvVeWi3kAQOZWzJzJDNHBgcHe9esJGmWRXmKKTN/C3wfOA1YERED1alVwO7qeBw4CaA6fwzw3GL0J0k6UJ1PMQ1GxIrqeBlwFvAY8DDwiWraOuD+6nhbVVOd35GZB6wgJEmLY2D+KYftRGBrRCxlKojuy8zvRMTPgXsjYhPwY+COav4dwDciosPUyuGiGnuTJM2jtoDIzEeBDxfGn2DqfsT+43uBC+vqR5J0aPwktSSpyICQJBUZEJKkogUFRET8UURcUh0PRsTJ9bYlSWravAEREdcCnwe+UA0dBfxDnU1JbXbPPfdw+umnc++99zbditSVhawgLgA+BrwMkJm7gbfV2ZTUZrfddhsAt956a8OdSN1ZSED8vvrAWgJExFvrbUlqr3vuuWdW7SpCbbaQgLgvIv6eqS0yLgUeAm6rty2pnaZXD9NcRajN5v2gXGZeHxFnAy8Cfwhck5kP1t6ZJKlRcwZEtU3G9zLzLKa265YkHSHmvMSUma8CeyLimEXqR2q1Sy+9dFb9qU99qqFOpO4t5B7EXuAnEXFHRNw4/aq7MamNLr744ln1RRe556TaayGb9f1r9ZI0j06nc0A9PDzcUDdSdxZyk3rrfHMkTdm0adMB9V133dVMM1KX5g2IiFgN/A3wfuAt0+OZ+e4a+5JaadeuXXPWUpss5B7EncAtwCRwBnA38I06m5LaamhoaM5aapOFBMSyzNwORGb+MjM3AqP1tiW109VXXz1nLbXJgp5iioglwOMR8emIuAA4vua+pFYaHh7et2oYGhryBrVa7aABERHTl5HuB5YDfwmcAvwZsK7+1qR2OvvsswE455xzGu5E6s5cK4hTIuJdwMVMbfG9B/gc8OfA/yxCb1Ir3XnnnQDcfvvtDXcidWeugLgV+C7wXmBn9Rqb8S5pPzt27GBychKAyclJHn744YY7kg5fTO3kPceEiFsy8y8WqZ9DMjIykmNjZpX6x1lnnbUvIAAGBgZ46KGHGuxIOlBE7MzMkfnmzXuTul/DQepHM8OhVEttsqDvpJa0MBExZy21iQEh9dBRRx01Zy21iQEh9dD+j7aee+65DXUidc+AkHroPe95z6zaD8qpzQwIqYduuummWfXmzZsb6kTqngEh9ZBPMemNpLaAiIiTIuLhiHgsIn4WEZdX48dGxIMR8Xj1/vZqPKpvq+tExKMR8ZG6epPqMjAwMGcttUmdK4hJ4HOZ+T7gNOCyiHg/cAWwPTNXA9urGuBcYHX12sDUFuNSq1x55ZWz6quuuqqhTqTu1RYQmfl0Zv5Xdfw74DFgJbAWmP6Wuq3A+dXxWuDunPIDYEVEnFhXf1IdRkdH960aBgYGOOOMMxruSDp8i3IPIiKGgA8DPwROyMynYSpEeH3r8JXAkzN+bLwa2/93bYiIsYgYm5iYqLNt6bBMryJcPajtar9AGhFHA/8M/FVmvjjHJ0tLJw7YKCoztwBbYGovpl71KfXK6Ogoo6N+p5bar9YVREQcxVQ43JOZ/1INPzN96ah6f7YaHwdOmvHjq4DddfYnSTq4Op9iCuAO4LHM/NsZp7bx+hcOrWPqC4mmxz9ZPc10GvDC9KUoSdLiq/MS00eZ+va5n0TEI9XYlcBXgPsiYj3wK+DC6twDwBqgw9SXE11SY2+SpHnUFhCZ+R+U7ysAnFmYn8BldfUjSTo0fpJaklRkQEiSigwISVKRASFJKjIgpB7rdDqcd955dDqdpluRumJASD22adMmXn75ZTZt2tR0K1
JXDAiphzqdDrt27QJg165driLUagaE1EP7rxpcRajNDAiph6ZXDwerpTYxIKQeGhoamrOW2sSAkHroggsumFV//OMfb6gTqXsGhNRDX/va12bVN998c0OdSN0zIKQeeuWVV2bVe/fubagTqXsGhCSpyICQJBUZEJKkIgNC6iEfc9UbiQEh9ZCPueqNxICQesjHXPVGYkBIPeRjrnojMSAkSUUGhNRDS5cunbOW2sSAkHroqquumlVfffXVDXUidc+AkCQVGRBSD335y1+eVX/pS19qqBOpewaE1EOTk5Nz1lKbGBBSD3mTWm8kBoTUQ5k5Zy21iQEh9dBrr702Zy21iQEhSSqqLSAi4usR8WxE/HTG2LER8WBEPF69v70aj4i4MSI6EfFoRHykrr4kSQtT5wriLuCc/cauALZn5mpge1UDnAusrl4bgFtq7EuqzYoVK+aspTapLSAy89+B5/YbXgtsrY63AufPGL87p/wAWBERJ9bVm1SX66+/fs5aapPFvgdxQmY+DVC9H1+NrwSenDFvvBo7QERsiIixiBibmJiotVlJOpL1y03qKIwVnw/MzC2ZOZKZI4ODgzW3JR2aL37xi7Pqa665pqFOpO4tdkA8M33pqHp/thofB06aMW8VsHuRe5O69vTTT8+qd+/2r7Haa7EDYhuwrjpeB9w/Y/yT1dNMpwEvTF+KkiQ1Y6CuXxwR/wScDhwXEePAtcBXgPsiYj3wK+DCavoDwBqgA+wBLqmrL0nSwtQWEJn5pwc5dWZhbgKX1dWLJOnQ9ctNaklSnzEgJElFBoQkqciAkCQVGRCSpCIDQpJUVNtjrjry3HTTTXQ6nabb6DuXX3550y00anh4mM985jNNt6HD4ApCklQUbf7O3JGRkRwbG2u6DWmfNWvWsGfPnn318uXLeeCBBxrsSDpQROzMzJH55rmCkHroxhtvnLOW2sSAkHpoeHh43/Hy5ctn1VLbGBBSj61evZolS5a4elDrGRBSjy1fvpwPfOADrh7UegaEJKnIgJAkFRkQkqQiA0KSVGRASJKKDAhJUpGb9XXJDeq0v+m/D0f6Jn06UNs2LjQgutTpdHjkp4/x6vJjm25FfWLJ76f2N9v5xDMNd6J+snTPc023cMgMiB54dfmx/N971zTdhqQ+tuwX7du00XsQkqQiA0KSVGRASJKKDAhJUpE3qbv01FNPsXTPC628ASVp8Szd8xueemqy6TYOiSsISVKRAdGllStXAtF0G+ojS/a+yJK9LzbdhvpOVP9etEdfXWKKiHOAzcBS4PbM/ErDLc3LL4XR/jqd3wEw/O4TGu5E/eWE1v170TcBERFLgZuBs4Fx4EcRsS0zf95sZ3Nr08fmtTimt9jYvHlzw51I3embgABOBTqZ+QRARNwLrAX6OiD0OvelmuJeTLO1bf8hva6f7kGsBJ6cUY9XY7NExIaIGIuIsYmJiUVrTlqoZcuWsWzZsqbbkLrWTyuI0p3ePGAgcwuwBWBkZOSA82qO/0uU3lj6aQUxDpw0o14F7G6oF0k64vVTQPwIWB0RJ0fEm4CLgG0N9yRJR6y+ucSUmZMR8Wnge0w95vr1zPxZw21J0hGrbwICIDMfANyzQpL6QD9dYpIk9REDQpJUZEBIkooMCElSUWS297NmETEB/LLpPqSC44BfN92EdBDvyszB+Sa1OiCkfhURY5k50nQfUje8xCRJKjIgJElFBoRUjy1NNyB1y3sQkqQiVxCSpCIDQpJUZEBIkooMCElSkQEhSSr6fyXrCvn8Aw6uAAAAAElFTkSuQmCC\n",
"text/plain": [
""
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"# Use box plots to reveal outliers in the numeric columns\n",
"eda_getBulkPlot(df, ['age', 'fare'], eda_getOutlierBoxPlot)\n"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'\\nHandling outliers\\nThis depends on your business scenario. You can do any of the following:\\n* remove the outliers\\n* cap the outliers\\n* replace them with the mean or median\\n* further analyze them against the target variable to see whether they are consistent with the other values, then decide on the next step\\n https://www.kaggle.com/code/pmarcelino/comprehensive-data-exploration-with-python/notebook\\n\\n'"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"'''\n",
"Handling outliers\n",
"This depends on your business scenario. You can do any of the following:\n",
"* remove the outliers\n",
"* cap the outliers\n",
"* replace them with the mean or median\n",
"* further analyze them against the target variable to see whether they are consistent with the other values, then decide on the next step\n",
" https://www.kaggle.com/code/pmarcelino/comprehensive-data-exploration-with-python/notebook\n",
"\n",
"'''\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" survived \n",
" pclass \n",
" age \n",
" sibsp \n",
" parch \n",
" fare \n",
" adult_male \n",
" alone \n",
" \n",
" \n",
" \n",
" \n",
" survived \n",
" 1.000000 \n",
" -0.338481 \n",
" -0.077221 \n",
" -0.035322 \n",
" 0.081629 \n",
" 0.257307 \n",
" -0.557080 \n",
" -0.203367 \n",
" \n",
" \n",
" pclass \n",
" -0.338481 \n",
" 1.000000 \n",
" -0.369226 \n",
" 0.083081 \n",
" 0.018443 \n",
" -0.549500 \n",
" 0.094035 \n",
" 0.135207 \n",
" \n",
" \n",
" age \n",
" -0.077221 \n",
" -0.369226 \n",
" 1.000000 \n",
" -0.308247 \n",
" -0.189119 \n",
" 0.096067 \n",
" 0.280328 \n",
" 0.198270 \n",
" \n",
" \n",
" sibsp \n",
" -0.035322 \n",
" 0.083081 \n",
" -0.308247 \n",
" 1.000000 \n",
" 0.414838 \n",
" 0.159651 \n",
" -0.253586 \n",
" -0.584471 \n",
" \n",
" \n",
" parch \n",
" 0.081629 \n",
" 0.018443 \n",
" -0.189119 \n",
" 0.414838 \n",
" 1.000000 \n",
" 0.216225 \n",
" -0.349943 \n",
" -0.583398 \n",
" \n",
" \n",
" fare \n",
" 0.257307 \n",
" -0.549500 \n",
" 0.096067 \n",
" 0.159651 \n",
" 0.216225 \n",
" 1.000000 \n",
" -0.182024 \n",
" -0.271832 \n",
" \n",
" \n",
" adult_male \n",
" -0.557080 \n",
" 0.094035 \n",
" 0.280328 \n",
" -0.253586 \n",
" -0.349943 \n",
" -0.182024 \n",
" 1.000000 \n",
" 0.404744 \n",
" \n",
" \n",
" alone \n",
" -0.203367 \n",
" 0.135207 \n",
" 0.198270 \n",
" -0.584471 \n",
" -0.583398 \n",
" -0.271832 \n",
" 0.404744 \n",
" 1.000000 \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
" survived pclass age sibsp parch fare \\\n",
"survived 1.000000 -0.338481 -0.077221 -0.035322 0.081629 0.257307 \n",
"pclass -0.338481 1.000000 -0.369226 0.083081 0.018443 -0.549500 \n",
"age -0.077221 -0.369226 1.000000 -0.308247 -0.189119 0.096067 \n",
"sibsp -0.035322 0.083081 -0.308247 1.000000 0.414838 0.159651 \n",
"parch 0.081629 0.018443 -0.189119 0.414838 1.000000 0.216225 \n",
"fare 0.257307 -0.549500 0.096067 0.159651 0.216225 1.000000 \n",
"adult_male -0.557080 0.094035 0.280328 -0.253586 -0.349943 -0.182024 \n",
"alone -0.203367 0.135207 0.198270 -0.584471 -0.583398 -0.271832 \n",
"\n",
" adult_male alone \n",
"survived -0.557080 -0.203367 \n",
"pclass 0.094035 0.135207 \n",
"age 0.280328 0.198270 \n",
"sibsp -0.253586 -0.584471 \n",
"parch -0.349943 -0.583398 \n",
"fare -0.182024 -0.271832 \n",
"adult_male 1.000000 0.404744 \n",
"alone 0.404744 1.000000 "
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#5. Feature correlation: only numeric (and boolean) variables are shown; categorical ones are excluded\n",
"df.corr()\n"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"survived 1.000000\n",
"fare 0.257307\n",
"parch 0.081629\n",
"sibsp -0.035322\n",
"age -0.077221\n",
"alone -0.203367\n",
"pclass -0.338481\n",
"adult_male -0.557080\n",
"Name: survived, dtype: float64\n"
]
}
],
"source": [
"# Custom function listing each column's correlation with the target in descending order; fare has the strongest positive correlation with survival, adult_male the strongest negative\n",
"eda_getCorrelation (df, 'survived')"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Take a closer look using a pair plot\n",
"eda_getPairPlot(df,['survived','age', 'fare'])"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Use a heat map to show collinearity between features\n",
"eda_getCorrlationHeatMap (df)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"eda_getTopKCorrelatedColumnHeatMap (df, 5, 'survived' )\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1 num__fare\n",
"2 cat__x0_female\n",
"3 cat__x0_male\n",
"4 cat__x1_C\n",
"6 cat__x1_S\n",
"8 cat__x2_child\n",
"9 cat__x2_man\n",
"10 cat__x2_woman\n",
"11 cat__x3_Cherbourg\n",
"13 cat__x3_Southampton\n",
"dtype: object"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#6. Select the K best features\n",
"# Use f_classif for classification targets and f_regression for regression targets\n",
"eda_getKBestFeatures(df, 10,['age', 'fare'], ['sex', 'embarked', 'who', 'embark_town'], 'survived', f_classif )"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}