{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"from matplotlib.pyplot import savefig\n",
"\n",
"# Select seaborn's white style for the figures drawn below\n",
"sns.set(style=\"white\")"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Helper functions: indices of the largest / smallest values of an array\n",
"\n",
"def get_highest_values(arr, n):\n",
"    \"\"\"Return the positions of the `n` largest values of `arr`, largest first.\n",
"\n",
"    Args:\n",
"        arr: sequence or array of values (a flat list in this notebook).\n",
"        n: number of positions wanted; n <= 0 gives an empty result and\n",
"            n larger than len(arr) gives every position.\n",
"\n",
"    Returns:\n",
"        numpy array of integer positions into `arr`.\n",
"    \"\"\"\n",
"    # Reverse the ascending argsort first, then take a prefix: unlike `[-n:]`,\n",
"    # `[:n]` is empty for n == 0 instead of silently returning everything.\n",
"    return np.array(arr).argsort()[::-1][:max(n, 0)]\n",
"\n",
"def get_lowest_values(arr, n):\n",
"    \"\"\"Return the positions of the `n` smallest values of `arr`, smallest first.\n",
"\n",
"    Args:\n",
"        arr: sequence or array of values (a flat list in this notebook).\n",
"        n: number of positions wanted; n <= 0 gives an empty result and\n",
"            n larger than len(arr) gives every position.\n",
"\n",
"    Returns:\n",
"        numpy array of integer positions into `arr`.\n",
"    \"\"\"\n",
"    # The ascending argsort already lists the smallest values first, so a\n",
"    # plain prefix is enough (and is empty for n == 0, unlike `[-n:]`).\n",
"    return np.array(arr).argsort()[:max(n, 0)]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# Path of the training file read by the analysis cells below.\n",
"# NOTE(review): presumably written by the generation command (--output data/temp) - confirm.\n",
"data_file = \"data/temp.train\"\n",
"# Upper bound of the feature interval. It is interpolated into the command\n",
"# below so the generated data and the later `interval`-based slice of the\n",
"# DataFrame cannot drift apart.\n",
"interval = 16\n",
"\n",
"# IPython expands {interval} before handing the line to the shell.\n",
"!python generate/generate_data_model_random.py --output data/temp --interval \"0, {interval}\" --kind svdne --metric sub_blocks_area --scenes \"A, D, G, H\" --nb_zones 16 --random 1 --percent 1.0 --step 10 --each 1 --renderer maxwell --custom temp_min_max_values"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Correlation analysis between SVD features"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
" 4 | \n",
" 5 | \n",
" 6 | \n",
" 7 | \n",
" 8 | \n",
" 9 | \n",
" 10 | \n",
" 11 | \n",
" 12 | \n",
" 13 | \n",
" 14 | \n",
" 15 | \n",
" 16 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0.324794 | \n",
" 0.312866 | \n",
" 0.322848 | \n",
" 0.299767 | \n",
" 0.293363 | \n",
" 0.297960 | \n",
" 0.356265 | \n",
" 0.352835 | \n",
" 0.342121 | \n",
" 0.346001 | \n",
" 0.329509 | \n",
" 0.395399 | \n",
" 0.329650 | \n",
" 0.330198 | \n",
" 0.373116 | \n",
" 0.376142 | \n",
"
\n",
" \n",
" 1 | \n",
" 0.405203 | \n",
" 0.398725 | \n",
" 0.382701 | \n",
" 0.371753 | \n",
" 0.359809 | \n",
" 0.378521 | \n",
" 0.433581 | \n",
" 0.430025 | \n",
" 0.422838 | \n",
" 0.412620 | \n",
" 0.397897 | \n",
" 0.450624 | \n",
" 0.399871 | \n",
" 0.401700 | \n",
" 0.441800 | \n",
" 0.445671 | \n",
"
\n",
" \n",
" 2 | \n",
" 0.323862 | \n",
" 0.311452 | \n",
" 0.321688 | \n",
" 0.299141 | \n",
" 0.292455 | \n",
" 0.297024 | \n",
" 0.354646 | \n",
" 0.351711 | \n",
" 0.342375 | \n",
" 0.344822 | \n",
" 0.329067 | \n",
" 0.395195 | \n",
" 0.329192 | \n",
" 0.329481 | \n",
" 0.372637 | \n",
" 0.375534 | \n",
"
\n",
" \n",
" 3 | \n",
" 0.324096 | \n",
" 0.312037 | \n",
" 0.322657 | \n",
" 0.299463 | \n",
" 0.292699 | \n",
" 0.297577 | \n",
" 0.355800 | \n",
" 0.352304 | \n",
" 0.342609 | \n",
" 0.345576 | \n",
" 0.328781 | \n",
" 0.395006 | \n",
" 0.329373 | \n",
" 0.329939 | \n",
" 0.372829 | \n",
" 0.376044 | \n",
"
\n",
" \n",
" 4 | \n",
" 0.331964 | \n",
" 0.317376 | \n",
" 0.328495 | \n",
" 0.306394 | \n",
" 0.300271 | \n",
" 0.305322 | \n",
" 0.361229 | \n",
" 0.359550 | \n",
" 0.348661 | \n",
" 0.350282 | \n",
" 0.333434 | \n",
" 0.398975 | \n",
" 0.334180 | \n",
" 0.334781 | \n",
" 0.377515 | \n",
" 0.381927 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" 1 2 3 4 5 6 7 \\\n",
"0 0.324794 0.312866 0.322848 0.299767 0.293363 0.297960 0.356265 \n",
"1 0.405203 0.398725 0.382701 0.371753 0.359809 0.378521 0.433581 \n",
"2 0.323862 0.311452 0.321688 0.299141 0.292455 0.297024 0.354646 \n",
"3 0.324096 0.312037 0.322657 0.299463 0.292699 0.297577 0.355800 \n",
"4 0.331964 0.317376 0.328495 0.306394 0.300271 0.305322 0.361229 \n",
"\n",
" 8 9 10 11 12 13 14 \\\n",
"0 0.352835 0.342121 0.346001 0.329509 0.395399 0.329650 0.330198 \n",
"1 0.430025 0.422838 0.412620 0.397897 0.450624 0.399871 0.401700 \n",
"2 0.351711 0.342375 0.344822 0.329067 0.395195 0.329192 0.329481 \n",
"3 0.352304 0.342609 0.345576 0.328781 0.395006 0.329373 0.329939 \n",
"4 0.359550 0.348661 0.350282 0.333434 0.398975 0.334180 0.334781 \n",
"\n",
" 15 16 \n",
"0 0.373116 0.376142 \n",
"1 0.441800 0.445671 \n",
"2 0.372637 0.375534 \n",
"3 0.372829 0.376044 \n",
"4 0.377515 0.381927 "
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Read the ';'-separated file without a header row (columns get integer labels\n",
"# 0, 1, 2, ...) and discard column 0 in the same expression.\n",
"# NOTE(review): column 0 looks like the label column - confirm.\n",
"df = pd.read_csv(data_file, sep=';', header=None).drop(columns=[0])\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"# Compute the correlation matrix between the feature columns.\n",
"# `df[1:interval]` slices ROWS 1..interval-1, which restricts the computation\n",
"# to a handful of samples; `.loc[:, 1:interval]` selects the columns labelled\n",
"# 1..interval (label slices are inclusive) and keeps every row.\n",
"corr = df.loc[:, 1:interval].corr()"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Generate a mask for the upper triangle (diagonal included) so that each\n",
"# pair of features is drawn only once.\n",
"# Builtin `bool` replaces the `np.bool` alias, which was removed in NumPy 1.24.\n",
"mask = np.zeros_like(corr, dtype=bool)\n",
"mask[np.triu_indices_from(mask)] = True\n",
"\n",
"# Set up the matplotlib figure\n",
"f, ax = plt.subplots(figsize=(30, 20))\n",
"\n",
"# Generate a custom diverging colormap\n",
"cmap = sns.diverging_palette(220, 10, as_cmap=True)\n",
"\n",
"# Draw the heatmap with the mask and correct aspect ratio, explicitly on `ax`\n",
"sns.heatmap(corr, mask=mask, cmap=cmap, ax=ax,\n",
"            square=True, linewidths=.5, cbar_kws={\"shrink\": .5})\n",
"# Save through the figure object rather than the implicit current figure\n",
"f.savefig('corr_no_label.png')"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# One score per column of `corr`: the sum of the absolute correlations with\n",
"# every other column (the entry at the column's own position is skipped).\n",
"features_corr = [\n",
"    sum(abs(score) for pos, score in enumerate(corr[label]) if pos != idx)\n",
"    for idx, label in enumerate(corr)\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([101, 100, 102, 99, 103, 104, 98, 105, 97, 96, 95, 106, 94,\n",
" 107, 93, 108, 92, 109, 91, 110])"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 0-based positions (in the column order of `corr`) of the 20 features with the\n",
"# highest summed absolute correlation to the other features, highest first.\n",
"get_highest_values(features_corr, 20)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([199, 1, 0, 3, 2, 4, 5, 6, 7, 8, 9, 10, 198,\n",
" 11, 12, 197, 196, 195, 13, 193])"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 0-based positions (in the column order of `corr`) of the 20 features with the\n",
"# lowest summed absolute correlation to the other features, lowest first.\n",
"get_lowest_values(features_corr, 20)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Correlation analysis between SVD features and labels"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 0 | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
" 4 | \n",
" 5 | \n",
" 6 | \n",
" 7 | \n",
" 8 | \n",
" 9 | \n",
" 10 | \n",
" 11 | \n",
" 12 | \n",
" 13 | \n",
" 14 | \n",
" 15 | \n",
" 16 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0 | \n",
" 0.324794 | \n",
" 0.312866 | \n",
" 0.322848 | \n",
" 0.299767 | \n",
" 0.293363 | \n",
" 0.297960 | \n",
" 0.356265 | \n",
" 0.352835 | \n",
" 0.342121 | \n",
" 0.346001 | \n",
" 0.329509 | \n",
" 0.395399 | \n",
" 0.329650 | \n",
" 0.330198 | \n",
" 0.373116 | \n",
" 0.376142 | \n",
"
\n",
" \n",
" 1 | \n",
" 1 | \n",
" 0.405203 | \n",
" 0.398725 | \n",
" 0.382701 | \n",
" 0.371753 | \n",
" 0.359809 | \n",
" 0.378521 | \n",
" 0.433581 | \n",
" 0.430025 | \n",
" 0.422838 | \n",
" 0.412620 | \n",
" 0.397897 | \n",
" 0.450624 | \n",
" 0.399871 | \n",
" 0.401700 | \n",
" 0.441800 | \n",
" 0.445671 | \n",
"
\n",
" \n",
" 2 | \n",
" 0 | \n",
" 0.323862 | \n",
" 0.311452 | \n",
" 0.321688 | \n",
" 0.299141 | \n",
" 0.292455 | \n",
" 0.297024 | \n",
" 0.354646 | \n",
" 0.351711 | \n",
" 0.342375 | \n",
" 0.344822 | \n",
" 0.329067 | \n",
" 0.395195 | \n",
" 0.329192 | \n",
" 0.329481 | \n",
" 0.372637 | \n",
" 0.375534 | \n",
"
\n",
" \n",
" 3 | \n",
" 0 | \n",
" 0.324096 | \n",
" 0.312037 | \n",
" 0.322657 | \n",
" 0.299463 | \n",
" 0.292699 | \n",
" 0.297577 | \n",
" 0.355800 | \n",
" 0.352304 | \n",
" 0.342609 | \n",
" 0.345576 | \n",
" 0.328781 | \n",
" 0.395006 | \n",
" 0.329373 | \n",
" 0.329939 | \n",
" 0.372829 | \n",
" 0.376044 | \n",
"
\n",
" \n",
" 4 | \n",
" 0 | \n",
" 0.331964 | \n",
" 0.317376 | \n",
" 0.328495 | \n",
" 0.306394 | \n",
" 0.300271 | \n",
" 0.305322 | \n",
" 0.361229 | \n",
" 0.359550 | \n",
" 0.348661 | \n",
" 0.350282 | \n",
" 0.333434 | \n",
" 0.398975 | \n",
" 0.334180 | \n",
" 0.334781 | \n",
" 0.377515 | \n",
" 0.381927 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" 0 1 2 3 4 5 6 7 \\\n",
"0 0 0.324794 0.312866 0.322848 0.299767 0.293363 0.297960 0.356265 \n",
"1 1 0.405203 0.398725 0.382701 0.371753 0.359809 0.378521 0.433581 \n",
"2 0 0.323862 0.311452 0.321688 0.299141 0.292455 0.297024 0.354646 \n",
"3 0 0.324096 0.312037 0.322657 0.299463 0.292699 0.297577 0.355800 \n",
"4 0 0.331964 0.317376 0.328495 0.306394 0.300271 0.305322 0.361229 \n",
"\n",
" 8 9 10 11 12 13 14 \\\n",
"0 0.352835 0.342121 0.346001 0.329509 0.395399 0.329650 0.330198 \n",
"1 0.430025 0.422838 0.412620 0.397897 0.450624 0.399871 0.401700 \n",
"2 0.351711 0.342375 0.344822 0.329067 0.395195 0.329192 0.329481 \n",
"3 0.352304 0.342609 0.345576 0.328781 0.395006 0.329373 0.329939 \n",
"4 0.359550 0.348661 0.350282 0.333434 0.398975 0.334180 0.334781 \n",
"\n",
" 15 16 \n",
"0 0.373116 0.376142 \n",
"1 0.441800 0.445671 \n",
"2 0.372637 0.375534 \n",
"3 0.372829 0.376044 \n",
"4 0.377515 0.381927 "
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Reload the same file, this time keeping column 0 (it was dropped in the\n",
"# previous section). This rebinds `df`, replacing the feature-only frame.\n",
"# NOTE(review): column 0 looks like the label given the section title - confirm.\n",
"df = pd.read_csv(data_file, sep=';', header=None)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"# Compute the correlation matrix over every column of `df`, column 0 included;\n",
"# this rebinds `corr`, replacing the matrix computed in the previous section\n",
"corr = df.corr()"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"