diff --git a/2017/07-decision-tree/resp_DecisionTree_FranciscoMatheusPinheiroNeryBarbosa.ipynb b/2017/07-decision-tree/resp_DecisionTree_FranciscoMatheusPinheiroNeryBarbosa.ipynb
new file mode 100644
index 0000000..a27c80a
--- /dev/null
+++ b/2017/07-decision-tree/resp_DecisionTree_FranciscoMatheusPinheiroNeryBarbosa.ipynb
@@ -0,0 +1,551 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Decision Tree"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "* Você pode baixar o dataset em https://archive.ics.uci.edu/ml/datasets/Car+Evaluation."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import math\n",
+ "import os\n",
+ "import subprocess\n",
+ "\n",
+ "import matplotlib.pyplot as plt\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "from sklearn.ensemble import RandomForestClassifier\n",
+ "from sklearn.tree import DecisionTreeClassifier, export_graphviz"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# The CSV has no header row, so the column names are supplied explicitly.\n",
+ "headers = [\"buying\", \"maint\", \"doors\", \"persons\", \"lug_boot\", \"safety\", \"class\"]\n",
+ "data = pd.read_csv(\"carData.csv\", header=None, names=headers)\n",
+ "\n",
+ "# Shuffle the rows with a fixed seed so that re-running the notebook reproduces\n",
+ "# the same row order, and therefore the same train/test split further down.\n",
+ "SHUFFLE_SEED = 42\n",
+ "shuffle_rng = np.random.RandomState(SHUFFLE_SEED)\n",
+ "data = data.take(shuffle_rng.permutation(len(data)))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "No código acima, fazemos a leitura do arquivo, informando que não há cabeçalho (obrigatório) e que a nossa coluna 6 (0-6) representa a label"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " buying | \n",
+ " maint | \n",
+ " doors | \n",
+ " persons | \n",
+ " lug_boot | \n",
+ " safety | \n",
+ " class | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 583 | \n",
+ " high | \n",
+ " high | \n",
+ " 3 | \n",
+ " 4 | \n",
+ " big | \n",
+ " med | \n",
+ " acc | \n",
+ "
\n",
+ " \n",
+ " 508 | \n",
+ " high | \n",
+ " vhigh | \n",
+ " 4 | \n",
+ " more | \n",
+ " med | \n",
+ " med | \n",
+ " unacc | \n",
+ "
\n",
+ " \n",
+ " 1098 | \n",
+ " med | \n",
+ " med | \n",
+ " 2 | \n",
+ " more | \n",
+ " small | \n",
+ " low | \n",
+ " unacc | \n",
+ "
\n",
+ " \n",
+ " 680 | \n",
+ " high | \n",
+ " med | \n",
+ " 3 | \n",
+ " 2 | \n",
+ " med | \n",
+ " high | \n",
+ " unacc | \n",
+ "
\n",
+ " \n",
+ " 795 | \n",
+ " high | \n",
+ " low | \n",
+ " 3 | \n",
+ " 4 | \n",
+ " med | \n",
+ " low | \n",
+ " unacc | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " buying maint doors persons lug_boot safety class\n",
+ "583 high high 3 4 big med acc\n",
+ "508 high vhigh 4 more med med unacc\n",
+ "1098 med med 2 more small low unacc\n",
+ "680 high med 3 2 med high unacc\n",
+ "795 high low 3 4 med low unacc"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "buying object\n",
+ "maint object\n",
+ "doors object\n",
+ "persons object\n",
+ "lug_boot object\n",
+ "safety object\n",
+ "class object\n",
+ "dtype: object"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data.dtypes"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "O problema é que nossos dados categóricos são strings. Então precisamos converter em representantes numéricos para aplicarmos no algoritmo"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " buying | \n",
+ " maint | \n",
+ " doors | \n",
+ " persons | \n",
+ " lug_boot | \n",
+ " safety | \n",
+ "
\n",
+ " \n",
+ " class | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 0 | \n",
+ " 3 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 0 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 0 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " buying maint doors persons lug_boot safety\n",
+ "class \n",
+ "0 0 0 1 1 0 2\n",
+ "2 0 3 2 2 1 2\n",
+ "2 2 2 0 2 2 1\n",
+ "2 0 2 1 0 1 0\n",
+ "2 0 1 1 1 1 1"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Replace every string column (the label included) by its integer category code.\n",
+ "for column in headers:\n",
+ "    data[column] = data[column].astype('category').cat.codes\n",
+ "\n",
+ "# Move the encoded label into the index so only the six features remain as columns.\n",
+ "data = data.set_index(\"class\")\n",
+ "data.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Faremos a separação dos dados em conjunto de treino e teste"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# The first 70% of the rows (rounded down) form the training set, the rest the test set.\n",
+ "size = len(data)\n",
+ "trainSize = int(math.floor(0.7 * size))\n",
+ "trainData, testData = data.iloc[:trainSize], data.iloc[trainSize:]\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Agora ok!\n",
+ "Vamos ao que interessa..."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/home/mnery/miniconda3/envs/ds/lib/python3.6/site-packages/ipykernel_launcher.py:2: DeprecationWarning: \n",
+ ".ix is deprecated. Please use\n",
+ ".loc for label based indexing or\n",
+ ".iloc for positional indexing\n",
+ "\n",
+ "See the documentation here:\n",
+ "http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated\n",
+ " \n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,\n",
+ " max_features=None, max_leaf_nodes=None,\n",
+ " min_impurity_decrease=0.0, min_impurity_split=None,\n",
+ " min_samples_leaf=1, min_samples_split=2,\n",
+ " min_weight_fraction_leaf=0.0, presort=False, random_state=None,\n",
+ " splitter='best')"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Fit an entropy-based tree: the six feature columns are the inputs and the\n",
+ "# encoded class stored in the index is the target.\n",
+ "# .iloc replaces the deprecated .ix indexer (positional column slice 0..5).\n",
+ "dTree = DecisionTreeClassifier(criterion=\"entropy\")\n",
+ "dTree.fit(trainData.iloc[:, 0:6], trainData.index)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/home/mnery/miniconda3/envs/ds/lib/python3.6/site-packages/ipykernel_launcher.py:1: DeprecationWarning: \n",
+ ".ix is deprecated. Please use\n",
+ ".loc for label based indexing or\n",
+ ".iloc for positional indexing\n",
+ "\n",
+ "See the documentation here:\n",
+ "http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated\n",
+ " \"\"\"Entry point for launching an IPython kernel.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "0.96146435452793833"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Evaluate on the held-out rows; the encoded class label lives in the index.\n",
+ "# .iloc replaces the deprecated .ix indexer (positional column slice 0..5).\n",
+ "testFeatures = testData.iloc[:, 0:6]\n",
+ "testPredictions = dTree.predict(testFeatures)\n",
+ "dTree.score(testFeatures, testData.index)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "collapsed": true
+ },
+ "source": [
+ "### Atividades\n",
+ "\n",
+ "1. Utilizamos a medida de Entropia como fator de decisão (medida de impureza de um nó). Teste o mesmo conjunto \n",
+ "randômico de dados para a medida Gini e compare os resultados.\n",
+ "Ref1.: http://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html#sklearn.tree.DecisionTreeClassifier\n",
+ "Ref2.: https://en.wikipedia.org/wiki/Decision_tree_learning\n",
+ "\n",
+ "2. Aplique Decision Tree em outro dataset (link abaixo) e analise os resultados. Procure identificar (se houver) relações entre as features (correlacionadas, por exemplo) e faça testes eliminando as que você achar desnecessárias, de forma a tentar melhorar seu classificador, seja em predição ou performance.\n",
+ "Dataset: https://archive.ics.uci.edu/ml/datasets/Wine+Quality\n",
+ "\n",
+ "3. Execute a função abaixo para gerar a árvore que representa seu classificador (ambos utilizando entropia e gini como medidas). Analise a saída, entendendo como ela foi criada e os seus respectivos valores em relação à medida utilizada. Reflexão: seria possível construir nosso classificador apenas utilizando a estrutura condicional if-else?\n",
+ "\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def visualize_tree(tree, feature_names):\n",
+ "    \"\"\"Create a PNG image that represents the given decision tree.\n",
+ "\n",
+ "    The tree is first exported to ``dt.dot`` in the current working directory\n",
+ "    and then converted to ``dt.png`` by the external Graphviz ``dot`` command.\n",
+ "\n",
+ "    Args\n",
+ "    ----\n",
+ "    tree -- DecisionTree estimator to export.\n",
+ "    feature_names -- sequence with the names of the features.\n",
+ "\n",
+ "    Raises\n",
+ "    ------\n",
+ "    RuntimeError -- if the ``dot`` command cannot be started or exits with an error.\n",
+ "    \"\"\"\n",
+ "    with open(\"dt.dot\", 'w') as f:\n",
+ "        export_graphviz(tree, out_file=f, feature_names=feature_names)\n",
+ "\n",
+ "    command = [\"dot\", \"-Tpng\", \"dt.dot\", \"-o\", \"dt.png\"]\n",
+ "    try:\n",
+ "        subprocess.check_call(command)\n",
+ "    except (OSError, subprocess.CalledProcessError) as err:\n",
+ "        # Raise instead of calling exit(): the failure stays visible in the\n",
+ "        # notebook and the original cause is kept in the traceback.\n",
+ "        raise RuntimeError(\"Nao foi possivel gerar a arvore.\") from err"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Test-set score of a Gini random forest for every size from 1 to 99 trees.\n",
+ "# rf[i] holds the score of the forest built with n_estimators = i + 1.\n",
+ "rf = []\n",
+ "for n_trees in range(1, 100):\n",
+ "    forest = RandomForestClassifier(n_estimators=n_trees, criterion='gini')\n",
+ "    # The class label is the index, so the frame's columns are exactly the features.\n",
+ "    forest.fit(trainData, trainData.index)\n",
+ "    rf.append(forest.score(testData, testData.index))"
+ ]
+ },
+ {
+ "cell_type": "raw",
+ "metadata": {},
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# rf[i] belongs to the forest with i + 1 trees, so plot against 1..len(rf)\n",
+ "# rather than the list index to keep the x-axis equal to the number of trees.\n",
+ "plt.plot(range(1, len(rf) + 1), rf)\n",
+ "plt.xlabel('Arv')\n",
+ "plt.ylabel('scr')\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "0.976878612717\n",
+ "63\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Position of the best score in rf. Note that the forest stored at index i was\n",
+ "# built with n_estimators = i + 1, so the printed index is one less than its size.\n",
+ "best_index = np.argmax(rf)\n",
+ "print(rf[best_index])\n",
+ "print(best_index)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}