🌽 Crop Classification and Clustering¶

In [ ]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.gridspec as gridspec
import math

## Models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score

from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

EDA¶

In [ ]:
df = pd.read_csv('Crop_Recommendation.csv')
df
Out[ ]:
Nitrogen Phosphorus Potassium Temperature Humidity pH_Value Rainfall Crop
0 90 42 43 20.879744 82.002744 6.502985 202.935536 Rice
1 85 58 41 21.770462 80.319644 7.038096 226.655537 Rice
2 60 55 44 23.004459 82.320763 7.840207 263.964248 Rice
3 74 35 40 26.491096 80.158363 6.980401 242.864034 Rice
4 78 42 42 20.130175 81.604873 7.628473 262.717340 Rice
... ... ... ... ... ... ... ... ...
2195 107 34 32 26.774637 66.413269 6.780064 177.774507 Coffee
2196 99 15 27 27.417112 56.636362 6.086922 127.924610 Coffee
2197 118 33 30 24.131797 67.225123 6.362608 173.322839 Coffee
2198 117 32 34 26.272418 52.127394 6.758793 127.175293 Coffee
2199 104 18 30 23.603016 60.396475 6.779833 140.937041 Coffee

2200 rows × 8 columns

In [ ]:
features = df.columns[:-1]
features
Out[ ]:
Index(['Nitrogen', 'Phosphorus', 'Potassium', 'Temperature', 'Humidity',
       'pH_Value', 'Rainfall'],
      dtype='object')
In [ ]:
df.Crop.nunique()
Out[ ]:
22
In [ ]:
df.Crop.value_counts()
Out[ ]:
Crop
Rice           100
Maize          100
Jute           100
Cotton         100
Coconut        100
Papaya         100
Orange         100
Apple          100
Muskmelon      100
Watermelon     100
Grapes         100
Mango          100
Banana         100
Pomegranate    100
Lentil         100
Blackgram      100
MungBean       100
MothBeans      100
PigeonPeas     100
KidneyBeans    100
ChickPea       100
Coffee         100
Name: count, dtype: int64
In [ ]:
df.Crop.value_counts(normalize=True)
Out[ ]:
Crop
Rice           0.045455
Maize          0.045455
Jute           0.045455
Cotton         0.045455
Coconut        0.045455
Papaya         0.045455
Orange         0.045455
Apple          0.045455
Muskmelon      0.045455
Watermelon     0.045455
Grapes         0.045455
Mango          0.045455
Banana         0.045455
Pomegranate    0.045455
Lentil         0.045455
Blackgram      0.045455
MungBean       0.045455
MothBeans      0.045455
PigeonPeas     0.045455
KidneyBeans    0.045455
ChickPea       0.045455
Coffee         0.045455
Name: proportion, dtype: float64
In [ ]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2200 entries, 0 to 2199
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Nitrogen     2200 non-null   int64  
 1   Phosphorus   2200 non-null   int64  
 2   Potassium    2200 non-null   int64  
 3   Temperature  2200 non-null   float64
 4   Humidity     2200 non-null   float64
 5   pH_Value     2200 non-null   float64
 6   Rainfall     2200 non-null   float64
 7   Crop         2200 non-null   object 
dtypes: float64(4), int64(3), object(1)
memory usage: 137.6+ KB
In [ ]:
df.isna().sum()
Out[ ]:
Nitrogen       0
Phosphorus     0
Potassium      0
Temperature    0
Humidity       0
pH_Value       0
Rainfall       0
Crop           0
dtype: int64
In [ ]:
df.describe().T
Out[ ]:
count mean std min 25% 50% 75% max
Nitrogen 2200.0 50.551818 36.917334 0.000000 21.000000 37.000000 84.250000 140.000000
Phosphorus 2200.0 53.362727 32.985883 5.000000 28.000000 51.000000 68.000000 145.000000
Potassium 2200.0 48.149091 50.647931 5.000000 20.000000 32.000000 49.000000 205.000000
Temperature 2200.0 25.616244 5.063749 8.825675 22.769375 25.598693 28.561654 43.675493
Humidity 2200.0 71.481779 22.263812 14.258040 60.261953 80.473146 89.948771 99.981876
pH_Value 2200.0 6.469480 0.773938 3.504752 5.971693 6.425045 6.923643 9.935091
Rainfall 2200.0 103.463655 54.958389 20.211267 64.551686 94.867624 124.267508 298.560117

Probably a good idea to standardize these values

In [ ]:
# Correlation Matrix
corr_matrix = df.corr(numeric_only=True)
plt.figure(figsize=(15, 10))
sns.heatmap(corr_matrix, 
            annot=True, 
            linewidths=0.5, 
            fmt= ".2f", 
            cmap="YlGnBu");
No description has been provided for this image

My Image

My Image

My Image

In [ ]:
metric_col = df.select_dtypes(include=['number']).columns

Modeling¶

In [ ]:
X = df.drop('Crop', axis=1)
y = df.Crop.values
In [ ]:
scaler = StandardScaler()
X = scaler.fit_transform(X)
In [ ]:
# Train and Test Split
np.random.seed(42)

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2)
In [ ]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape
Out[ ]:
((1760, 7), (440, 7), (1760,), (440,))
In [ ]:
# Put models in a dictionary
models = {"KNN": KNeighborsClassifier(),
          "Logistic Regression": LogisticRegression(), 
          "Random Forest": RandomForestClassifier(),
          "GradientBoost": GradientBoostingClassifier(),
          "GaussianNB": GaussianNB(),
          }
In [ ]:
# Create function to fit and score models
def fit_and_score(models, X_train, X_test, y_train, y_test):
    """
    Fits and evaluates given machine learning models.
    models : a dict of different Scikit-Learn machine learning models
    X_train : training data
    X_test : testing data
    y_train : labels assosciated with training data
    y_test : labels assosciated with test data
    """
    # Random seed for reproducible results
    np.random.seed(42)
    # Make a list to keep model scores
    model_scores = {}
    # Loop through models
    for name, model in models.items():
        # Fit the model to the data
        model.fit(X_train, y_train)
        # Evaluate the model and append its score to model_scores
        model_scores[name] = model.score(X_test, y_test)
    return model_scores
In [ ]:
model_scores = fit_and_score(models=models,
                             X_train=X_train,
                             X_test=X_test,
                             y_train=y_train,
                             y_test=y_test)
model_scores
C:\Users\tuckerd9\AppData\Roaming\Python\Python39\site-packages\sklearn\linear_model\_logistic.py:444: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Out[ ]:
{'KNN': 0.9568181818181818,
 'Logistic Regression': 0.9636363636363636,
 'Random Forest': 0.9931818181818182,
 'GradientBoost': 0.9818181818181818,
 'GaussianNB': 0.9954545454545455}

I could go with GaussianNB, but just choosing RandomForest model

In [ ]:
model_compare = pd.DataFrame(model_scores, index=['accuracy'])
model_compare.T.plot.bar();
No description has been provided for this image
In [ ]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_preds_rf = rf.predict(X_test)
In [ ]:
# Create a new dataframe with two columns: 'test' and 'predicted'
df_with_preds = pd.DataFrame({
    'test': y_test,
    'predicted': y_preds_rf
})
df_with_preds
Out[ ]:
test predicted
0 Muskmelon Muskmelon
1 Watermelon Watermelon
2 Papaya Papaya
3 Papaya Papaya
4 Apple Apple
... ... ...
435 Rice Rice
436 Rice Rice
437 Cotton Cotton
438 Cotton Cotton
439 PigeonPeas PigeonPeas

440 rows × 2 columns

In [ ]:
# Create a new dataframe with the predicted values as a column
df_with_preds = pd.DataFrame(X_test, columns=X_test.columns)
df_with_preds['actual'] = y_test
df_with_preds['predicted'] = y_preds_rf
df_with_preds
Out[ ]:
Nitrogen Phosphorus Potassium Temperature Humidity pH_Value Rainfall actual predicted
1451 101 17 47 29.494014 94.729813 6.185053 26.308209 Muskmelon Muskmelon
1334 98 8 51 26.179346 86.522581 6.259336 49.430510 Watermelon Watermelon
1761 59 62 49 43.360515 93.351916 6.941497 114.778071 Papaya Papaya
1735 44 60 55 34.280461 90.555616 6.825371 98.540477 Papaya Papaya
1576 30 137 200 22.914300 90.704756 5.603413 118.604465 Apple Apple
... ... ... ... ... ... ... ... ... ...
59 99 55 35 21.723831 80.238990 6.501698 277.962619 Rice Rice
71 67 45 38 22.727910 82.170688 7.300411 260.887506 Rice Rice
1908 121 47 16 23.605640 79.295731 7.723240 72.498009 Cotton Cotton
1958 116 52 19 22.942767 75.371706 6.114526 67.080226 Cotton Cotton
482 5 68 20 19.043805 33.106951 6.121667 155.370562 PigeonPeas PigeonPeas

440 rows × 9 columns

In [ ]:
def plot_confusion_matrix(y_test, predictions):
    # Plot the confusion matrix
    cf_matrix = confusion_matrix(y_test, predictions)
    fig = plt.subplots(figsize=(10, 8))
    sns.set(font_scale=1.4)
    sns.heatmap(cf_matrix, annot=True, fmt='d')
    plt.xlabel('Predicted Label', fontsize=12)
    plt.xticks(fontsize=12)
    plt.ylabel('True Label', fontsize=12)
    plt.yticks(fontsize=12)
    plt.show()
    
    # Reset font scale to default
    sns.set(font_scale=1)
In [ ]:
# Confusion matrix
print(confusion_matrix(y_test, y_preds_rf))
In [ ]:
plot_confusion_matrix(y_test, y_preds_rf)
No description has been provided for this image
In [ ]:
# Show classification report
print(classification_report(y_test, y_preds_rf))
              precision    recall  f1-score   support

       Apple       1.00      1.00      1.00        23
      Banana       1.00      1.00      1.00        21
   Blackgram       1.00      1.00      1.00        20
    ChickPea       1.00      1.00      1.00        26
     Coconut       1.00      1.00      1.00        27
      Coffee       1.00      1.00      1.00        17
      Cotton       1.00      1.00      1.00        17
      Grapes       1.00      1.00      1.00        14
        Jute       0.92      1.00      0.96        23
 KidneyBeans       1.00      1.00      1.00        20
      Lentil       0.92      1.00      0.96        11
       Maize       1.00      1.00      1.00        21
       Mango       1.00      1.00      1.00        19
   MothBeans       1.00      0.96      0.98        24
    MungBean       1.00      1.00      1.00        19
   Muskmelon       1.00      1.00      1.00        17
      Orange       1.00      1.00      1.00        14
      Papaya       1.00      1.00      1.00        23
  PigeonPeas       1.00      1.00      1.00        23
 Pomegranate       1.00      1.00      1.00        23
        Rice       1.00      0.89      0.94        19
  Watermelon       1.00      1.00      1.00        19

    accuracy                           0.99       440
   macro avg       0.99      0.99      0.99       440
weighted avg       0.99      0.99      0.99       440

In [ ]:
# Get feature importance scores
importance_scores = rf.feature_importances_

# Get feature names
feature_names = X_train.columns.tolist()
In [ ]:
# Sort feature importance scores in descending order
sorted_idx = np.argsort(importance_scores)[::-1]

# Create plot
plt.figure(figsize=(10, 8))

# Plot feature importance scores
for i in range(len(importance_scores)):
    plt.bar(i, importance_scores[sorted_idx[i]], color='#0091ea', align='center')
    plt.text(i, 
             importance_scores[sorted_idx[i]]+0.01, 
             feature_names[sorted_idx[i]], 
             horizontalalignment='center')

# Add title and labels
plt.title('Feature Importance')
plt.xlabel('Feature Index')
Out[ ]:
Text(0.5, 0, 'Feature Index')
No description has been provided for this image

Hypermeter Tuning with GridSearchCV¶

In [ ]:
param_grid = {
    'n_estimators': [10, 50, 100, 200],
    'max_depth': [None, 10, 20, 30]
}
In [ ]:
grid_search = GridSearchCV(
                            estimator=rf,
                            param_grid=param_grid,
                            scoring='accuracy',
                            cv=5,
                            verbose=True,
                            )

grid_search.fit(X_train, y_train)

print("Best parameters: ", grid_search.best_params_)
print("Accuracy score: ", grid_search.score(X_test, y_test))
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Best parameters:  {'max_depth': 20, 'n_estimators': 50}
Accuracy score:  0.9931818181818182
  • No improvement of the original model with gridsearch
In [ ]:
# Get the best estimator
best_estimator = grid_search.best_estimator_

# Make predictions on the test set
y_pred = best_estimator.predict(X_test)

# Print the predictions
print("Predictions: ", y_pred)

Clustering¶

In [ ]:
df_cluster = df.drop('Crop', axis=1)
In [ ]:
# Not needed for this example since there are no categorical variables
df_cluster = pd.get_dummies(df_cluster, drop_first=True)
df_cluster.head(5)
Out[ ]:
Nitrogen Phosphorus Potassium Temperature Humidity pH_Value Rainfall
0 90 42 43 20.879744 82.002744 6.502985 202.935536
1 85 58 41 21.770462 80.319644 7.038096 226.655537
2 60 55 44 23.004459 82.320763 7.840207 263.964248
3 74 35 40 26.491096 80.158363 6.980401 242.864034
4 78 42 42 20.130175 81.604873 7.628473 262.717340
In [ ]:
# Finding optimal number of clusters
from sklearn.mixture import GaussianMixture
# Prepare
n_components = np.arange(1,10)

# Create GMM model
models = [GaussianMixture(n_components= n,
                          random_state = 1502).fit(df_cluster) for n in n_components]

#Plot
plt.plot(n_components,
         [m.bic(df_cluster) for m in models],
         label = 'BIC')
plt.plot(n_components,
         [m.aic(df_cluster) for m in models],
         label = 'AIC')
plt.legend()
plt.xlabel('Number of Components')
Out[ ]:
Text(0.5, 0, 'Number of Components')
No description has been provided for this image
In [ ]:
# Gaussian Mixture Model
model = GaussianMixture(n_components= 4,
                        random_state = 1502).fit(df_cluster)
In [ ]:
# Predict the cluster for each customer
cluster = pd.Series(model.predict(df_cluster))
cluster[:2]
Out[ ]:
0    1
1    1
dtype: int64
In [ ]:
# Create Cluster variable
df_cluster['cluster'] = cluster
df_cluster.head(5)
Out[ ]:
Nitrogen Phosphorus Potassium Temperature Humidity pH_Value Rainfall cluster
0 90 42 43 20.879744 82.002744 6.502985 202.935536 1
1 85 58 41 21.770462 80.319644 7.038096 226.655537 1
2 60 55 44 23.004459 82.320763 7.840207 263.964248 1
3 74 35 40 26.491096 80.158363 6.980401 242.864034 1
4 78 42 42 20.130175 81.604873 7.628473 262.717340 1

Visualizing the clustering¶

In [ ]:
from sklearn.decomposition import PCA
In [ ]:
data_pca = df_cluster.drop(['cluster'], axis=1)

3D Scatter of the data

In [ ]:
#Initiating PCA to reduce dimentions aka features to 3
pca = PCA(n_components=3)
pca.fit(data_pca)
PCA_ds = pd.DataFrame(pca.transform(data_pca), columns=(["col1","col2", "col3"]))
PCA_ds.describe().T
Out[ ]:
count mean std min 25% 50% 75% max
col1 2200.0 -2.480440e-15 58.604247 -104.182669 -35.757181 -8.465669 10.306607 180.711278
col2 2200.0 7.441320e-15 54.157657 -84.309526 -44.703712 -10.190573 44.016507 170.783030
col3 2200.0 -4.134067e-15 36.728830 -67.731895 -30.630587 -7.256216 28.947581 81.762206
In [ ]:
PCA_ds['Cluster'] = df_cluster['cluster']
PCA_ds
Out[ ]:
col1 col2 col3 Cluster
0 -59.969729 84.055388 32.450240 1
1 -64.090628 107.779037 24.381552 1
2 -75.156888 142.468675 -0.556024 1
3 -80.247626 117.340628 13.940485 1
4 -85.084925 137.343003 16.712434 1
... ... ... ... ...
2195 -64.055404 54.019513 44.868290 1
2196 -52.816986 3.172884 38.516656 1
2197 -65.984590 48.821451 55.387287 1
2198 -42.988902 7.978078 55.117203 1
2199 -55.797011 16.737268 43.669666 1

2200 rows × 4 columns

In [ ]:
# Export the plot as an HTML file
import plotly.graph_objects as go
import plotly.io as pio

# Write the scatterplot to an HTML file
pio.write_html(fig, file='scatterplot.html')

2D scatterplot of the data¶

In [ ]:
# Visualize the clusters using PCA
pca2 = PCA(n_components=2)
principal_components = pca2.fit_transform(X)
PCA_ds['PCA1'] = principal_components[:, 0]
PCA_ds['PCA2'] = principal_components[:, 1]

# Plot the PCA result
plt.figure(figsize=(10, 8))
sns.scatterplot(data=PCA_ds, x='PCA1', y='PCA2', hue='Cluster', palette='Set1', s=100)
plt.title('PCA of Crop Clusters')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.legend(title='Cluster')
plt.show()
No description has been provided for this image

Once I have narrowed down the RandomForest model that I want to utilize: Streamline the process

Combined preprocessing and gridsearchcv in one process

In [ ]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

# Does not apply to this dataset
# categorical_features = df.select_dtypes(exclude=['number']).columns
# categorical_transformer = Pipeline(steps=[
#     ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
#     ("onehot", OneHotEncoder(handle_unknown="ignore"))
# ])


# Create a pipeline for numeric features
numeric_features = df.select_dtypes(include=['number']).columns
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    ('scaler', StandardScaler())
])

# Create a preprocessor object
preprocessor = ColumnTransformer(
    transformers=[
        # ("cat", categorical_transformer, categorical_features),
        ("num", numeric_transformer, numeric_features)
    ])

# Create a pipeline that includes the preprocessor and the model
model = Pipeline(steps=[("preprocessor", preprocessor),
                        ("model", RandomForestClassifier())])

# Split the data
X = df.drop("Crop", axis=1)
y = df["Crop"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit and score the model
model.fit(X_train, y_train)
model.score(X_test, y_test)
print('Baseline model:', model.score(X_test, y_test))

# Define the parameter grid
pipe_grid = {
    'model__n_estimators': [10, 50, 100, 200],
    'model__max_depth': [None, 10, 20, 30]
}

# Create a GridSearchCV object
gs_model = GridSearchCV(model, pipe_grid, cv=5, verbose=2, n_jobs=-1)
gs_model.fit(X_train, y_train)

print("Best parameters:", gs_model.best_params_)
print("Best score:", gs_model.best_score_)
# Fit the model with the entire training data using the best parameters
model.set_params(**gs_model.best_params_)
model.fit(X_train, y_train)
# Print the test score
print("Test score:", model.score(X_test, y_test))
Baseline model: 0.9931818181818182
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Best parameters: {'model__max_depth': 30, 'model__n_estimators': 200}
Best score: 0.9960227272727273
Test score: 0.9931818181818182
In [ ]:
predictions = gs_model.best_estimator_.predict(X_test)
predictions