Wine Quality Project.sagews
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

white = pd.read_csv('Wine_data.csv')

columns = ['fixed acidity', 'volatile acidity', 'citric acid','residual sugar', 'chlorides', 'free sulfur dioxide','total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol', 'quality']
white.columns = columns

cat_quality = pd.DataFrame(white['quality'])
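
# Bin the wine 'quality' scores into three classes: 0 = low (3-5), 1 = medium (6), 2 = high (7+).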

for i in range(len(cat_quality)):
    if cat_quality.iloc[i][0] in [3, 4, 5]:
        cat_quality.iloc[i] = 0
    elif cat_quality.iloc[i][0] in [6]:
        cat_quality.iloc[i]= 1
    else:
        cat_quality.iloc[i] = 2
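
# A vectorized alternative that could replace the loop above (a commented-out sketch;
# pd.cut reproduces the same 0/1/2 binning via the buckets (0, 5] -> 0, (5, 6] -> 1, (6, 10] -> 2):
#     cat_quality = pd.DataFrame(
#         pd.cut(white['quality'], bins=[0, 5, 6, 10], labels=[0, 1, 2]).astype(int))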

white_cat = white.loc[:, 'fixed acidity':'alcohol'].copy()
white_cat['quality'] = cat_quality


features = ['fixed acidity', 'volatile acidity', 'citric acid','residual sugar', 'chlorides', 'free sulfur dioxide','total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol']
X = pd.DataFrame(white_cat, columns= features)
y = cat_quality

# Stratified 67/33 train/test split of the wine data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0, stratify=y)
# NOTE: the next call overwrites the wine training split with a synthetic two-class
# dataset from make_classification, so the forest below is fitted on synthetic data
# rather than on the wine features (drop these two lines to train on the real X_train/y_train).
X_train, y_train = make_classification(n_samples=1000, n_features=11,
                                       n_informative=2, n_redundant=0,
                                       random_state=0, shuffle=False)
clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(X_train, y_train)
# Echoed worksheet output of clf.fit():
# RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
#             max_depth=2, max_features=9, max_leaf_nodes=None,
#             min_impurity_decrease=0.0, min_impurity_split=None,
#             min_samples_leaf=1, min_samples_split=2,
#             min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
#             oob_score=False, random_state=0, verbose=0, warm_start=False)
clf.score(X_train, y_train)
list(zip(features, clf.feature_importances_))
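
# Sorted view of the same importances, largest first (a sketch):
sorted(zip(features, clf.feature_importances_), key=lambda pair: pair[1], reverse=True)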


predicted = clf.predict(X_test)
predicted = pd.DataFrame(predicted)

cm = metrics.confusion_matrix(y_test, predicted)
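
# accuracy_score and classification_report are imported above but never called; a minimal
# sketch of using them to summarize performance on the held-out wine test split:
print(accuracy_score(y_test, predicted))
print(classification_report(y_test, predicted))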


plt.figure(figsize=(9,9))
sns.heatmap(cm, annot=True, fmt="d", linewidths=.5, square = True, cmap = 'Blues_r')
plt.ylabel('Actual label', fontsize = 14)
plt.xlabel('Predicted label', fontsize = 14)
plt.show()
# Echoed worksheet output:
#   clf.score(X_train, y_train) -> 0.95399999999999996
#   list(zip(features, clf.feature_importances_)) ->
#     ('fixed acidity', 0.18878214854068887)
#     ('volatile acidity', 0.55876632682144667)
#     ('citric acid', 0.045455564029277688)
#     ('residual sugar', 0.034008117976926613)
#     ('chlorides', 0.0)
#     ('free sulfur dioxide', 0.054718306089931411)
#     ('total sulfur dioxide', 0.04174303541397819)
#     ('density', 0.01135927080158613)
#     ('pH', 0.057211004021981246)
#     ('sulphates', 0.0079562263041830956)
#     ('alcohol', 0.0)
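
# Second run: the same pipeline applied to Wine_data_2.csv.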
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

white = pd.read_csv('Wine_data_2.csv')

columns = ['fixed acidity', 'volatile acidity', 'citric acid','residual sugar', 'chlorides', 'free sulfur dioxide','total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol', 'quality']
white.columns = columns

cat_quality = pd.DataFrame(white['quality'])

for i in range(len(cat_quality)):
    if cat_quality.iloc[i][0] in [3, 4, 5]:
        cat_quality.iloc[i] = 0
    elif cat_quality.iloc[i][0] in [6]:
        cat_quality.iloc[i]= 1
    else:
        cat_quality.iloc[i] = 2

white_cat = white.loc[:, 'fixed acidity':'alcohol'].copy()
white_cat['quality'] = cat_quality


features = ['fixed acidity', 'volatile acidity', 'citric acid','residual sugar', 'chlorides', 'free sulfur dioxide','total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol']
X = pd.DataFrame(white_cat, columns= features)
y = cat_quality

# Stratified 67/33 split, again overwritten below by make_classification with the same
# synthetic data as in the first run (hence the identical score and importances).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0, stratify=y)
X_train, y_train = make_classification(n_samples=1000, n_features=11,
                                       n_informative=2, n_redundant=0,
                                       random_state=0, shuffle=False)
clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(X_train, y_train)
# Echoed worksheet output of clf.fit():
# RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
#             max_depth=2, max_features=9, max_leaf_nodes=None,
#             min_impurity_decrease=0.0, min_impurity_split=None,
#             min_samples_leaf=1, min_samples_split=2,
#             min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
#             oob_score=False, random_state=0, verbose=0, warm_start=False)
clf.score(X_train, y_train)
list(zip(features, clf.feature_importances_))


predicted = clf.predict(X_test)
predicted = pd.DataFrame(predicted)

cm = metrics.confusion_matrix(y_test, predicted)


plt.figure(figsize=(9,9))
sns.heatmap(cm, annot=True, fmt="d", linewidths=.5, square = True, cmap = 'Blues_r')
plt.ylabel('Actual label', fontsize = 14)
plt.xlabel('Predicted label', fontsize = 14)
plt.show()
# Echoed worksheet output (identical to the first run, since the model is fitted on the same synthetic data):
#   clf.score(X_train, y_train) -> 0.95399999999999996
#   list(zip(features, clf.feature_importances_)) ->
#     ('fixed acidity', 0.18878214854068887)
#     ('volatile acidity', 0.55876632682144667)
#     ('citric acid', 0.045455564029277688)
#     ('residual sugar', 0.034008117976926613)
#     ('chlorides', 0.0)
#     ('free sulfur dioxide', 0.054718306089931411)
#     ('total sulfur dioxide', 0.04174303541397819)
#     ('density', 0.01135927080158613)
#     ('pH', 0.057211004021981246)
#     ('sulphates', 0.0079562263041830956)
#     ('alcohol', 0.0)