In [1]:
# Import necessary libraries
import pandas as pd  # Used for data manipulation and analysis, including reading CSV files and handling structured data.
import numpy as np  # Provides support for numerical computations and handling large multidimensional arrays.
import matplotlib.pyplot as plt  # Used for data visualization, creating graphs and charts to explore data distributions.
import seaborn as sns  # Built on top of Matplotlib, provides enhanced visualization capabilities for statistical data.
from sklearn.model_selection import train_test_split  # Splits the dataset into training and testing sets for model evaluation.
from sklearn.preprocessing import LabelEncoder, OneHotEncoder  # Encodes categorical variables into numerical format for ML models.
from sklearn.ensemble import RandomForestRegressor  # Implements the Random Forest algorithm for car price prediction.
from sklearn.metrics import mean_squared_error, r2_score  # Evaluates model performance using MSE and R² metrics.

# Load Dataset
# NOTE: this is an absolute, machine-specific path; adjust it when running elsewhere.
file_path = r"D:\Portfolio 2.0\Datasets\DataSet Car.csv"
df = pd.read_csv(file_path)

# Drop the leftover index column ("Unnamed: 0") if the CSV contains one.
# errors="ignore" keeps this step from raising KeyError when the file was
# saved without that column; the non-inplace form rebinds `df` explicitly.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

# Step 1: Exploratory Data Analysis (EDA)

# Structural overview: column dtypes, non-null counts and memory usage
df.info()
summary_stats = df.describe()
print("\nSummary statistics:\n", summary_stats)

# Count nulls per column and report only the columns that have any
missing_values = df.isna().sum()
print("\nMissing values in dataset:\n", missing_values.loc[missing_values.gt(0)])

# Object-dtype columns are treated as the categorical features
categorical_columns = df.select_dtypes(include="object").columns
print("\nCategorical Columns:", categorical_columns)

# Peek at the raw category values before any encoding is applied
print("\nSample Values from Categorical Columns:")
print(df.loc[:, categorical_columns].head())

# Build an all-numeric copy of the data so every column can enter the
# correlation matrix. Each categorical column is cast to str and label-encoded
# with its own LabelEncoder; `df` itself is left untouched here.
df_encoded = df.assign(
    **{
        col: LabelEncoder().fit_transform(df[col].astype(str))
        for col in categorical_columns
    }
)

# Correlation Heatmap (After Encoding)
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(df_encoded.corr(), annot=True, cmap="coolwarm", fmt=".2f", ax=ax)
ax.set_title("Feature Correlation Heatmap")
plt.show()

# Histogram of the target variable with a KDE overlay
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(data=df, x="Price", bins=50, kde=True, color="blue", ax=ax)
ax.set(title="Car Price Distribution", xlabel="Price", ylabel="Frequency")
plt.show()

# Box plot of the target variable
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=df, y="Price", ax=ax)
ax.set_title("Box Plot of Car Prices")
plt.show()

# Step 2: Preprocessing - Encoding Categorical Features

# Fit one LabelEncoder per categorical column (values are cast to str first)
# and keep every fitted encoder in `label_encoders`, keyed by column name.
label_encoders = {
    col: LabelEncoder().fit(df[col].astype(str)) for col in categorical_columns
}

# Overwrite each categorical column of `df` with its integer codes
for col, encoder in label_encoders.items():
    df[col] = encoder.transform(df[col].astype(str))

# Confirm the dtypes after encoding and show a sample of the encoded frame
print("\nData Types After Encoding:")
print(df.dtypes)
print("\nUpdated Dataset Sample:")
print(df.head())

# Train-Test Split: "Price" is the target, every other column is a feature
y = df["Price"]
X = df.drop(columns="Price")
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the Random Forest regressor on the training portion
rf_model = RandomForestRegressor(
    n_estimators=100, max_depth=10, random_state=42
).fit(X_train, y_train)

# Predict on the held-out portion and score the predictions
y_pred = rf_model.predict(X_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print("\nModel Performance:")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"R-squared Score (R²): {r2:.4f}")

# Visualizations

# Feature Importance Plot: rank features from most to least important and
# keep the ten highest-scoring ones.
feature_importance = rf_model.feature_importances_
sorted_idx = np.argsort(feature_importance)[::-1]
top_idx = sorted_idx[:10]
top_features = X_train.columns.to_numpy()[top_idx]
top_scores = feature_importance[top_idx]

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(top_features, top_scores, color="blue")
ax.set(
    xlabel="Feature Importance Score",
    ylabel="Features",
    title="Top 10 Important Features in Car Price Prediction",
)
# barh draws the first entry at the bottom; flip so the top feature is on top
ax.invert_yaxis()
plt.show()

# Actual vs Predicted Prices (Scatter Plot)
# The dashed diagonal marks where predicted == actual, drawn across the range
# of the actual test prices.
price_lo, price_hi = y_test.min(), y_test.max()

fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_test, y_pred, color="blue", alpha=0.5, label="Predicted Prices")
ax.plot([price_lo, price_hi], [price_lo, price_hi], 'r--', label="Ideal Prediction")
ax.set(
    xlabel="Actual Prices",
    ylabel="Predicted Prices",
    title="Actual vs Predicted Car Prices (Random Forest)",
)
ax.legend()
plt.show()

# Price distribution on the test set: actual vs predicted.
# Both histograms use one shared set of 30 bin edges computed over the combined
# actual + predicted values. If each series picked its own edges, the bins would
# have different widths and the overlaid bar heights would not be comparable.
bin_edges = np.histogram_bin_edges(
    np.concatenate([np.asarray(y_test), np.asarray(y_pred)]), bins=30
)

plt.figure(figsize=(8, 6))
plt.hist(y_test, bins=bin_edges, alpha=0.7, color="blue", label="Actual Prices")
plt.hist(y_pred, bins=bin_edges, alpha=0.7, color="red", label="Predicted Prices")
plt.xlabel("Car Price")
plt.ylabel("Frequency")
plt.title("Car Price Distribution: Actual vs Predicted")
plt.legend()
plt.show()

# Error Distribution Plot: residuals are actual minus predicted test prices
errors = y_test - y_pred

fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(errors, bins=30, kde=True, color="purple", ax=ax)
ax.set(
    xlabel="Prediction Error",
    ylabel="Frequency",
    title="Error Distribution (Random Forest)",
)
plt.show()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46022 entries, 0 to 46021
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Company Name         46022 non-null  object
 1   Model Name           46022 non-null  object
 2   Price                46022 non-null  int64 
 3   Model Year           46022 non-null  int64 
 4   Location             46022 non-null  object
 5   Mileage              46022 non-null  int64 
 6   Engine Type          46022 non-null  object
 7   Engine Capacity      46022 non-null  int64 
 8   Color                46022 non-null  object
 9   Assembly             46022 non-null  object
 10  Body Type            46022 non-null  object
 11  Transmission Type    46022 non-null  object
 12  Registration Status  46022 non-null  object
dtypes: int64(4), object(9)
memory usage: 4.6+ MB

Summary statistics:
               Price    Model Year        Mileage  Engine Capacity
count  4.602200e+04  46022.000000   46022.000000     46022.000000
mean   2.014153e+06   2011.035374   90965.128243      1313.115575
std    2.939071e+06      6.399403   63656.656034       614.690832
min    1.110000e+05   1990.000000       1.000000        16.000000
25%    8.500000e+05   2007.000000   48899.500000      1000.000000
50%    1.450000e+06   2013.000000   80000.000000      1300.000000
75%    2.300000e+06   2016.000000  120000.000000      1500.000000
max    7.750000e+07   2019.000000  999999.000000      6600.000000

Missing values in dataset:
 Series([], dtype: int64)

Categorical Columns: Index(['Company Name', 'Model Name', 'Location', 'Engine Type', 'Color',
       'Assembly', 'Body Type', 'Transmission Type', 'Registration Status'],
      dtype='object')

Sample Values from Categorical Columns:
  Company Name Model Name   Location Engine Type   Color  Assembly  Body Type  \
0       Toyota       Vitz  Islamabad      Petrol  Silver  Imported  Hatchback   
1       Toyota    Corolla        KPK      Petrol   White     Local      Sedan   
2       Suzuki       Alto        KPK      Petrol   White     Local  Hatchback   
3       Suzuki       Alto     Punjab      Petrol   White     Local  Hatchback   
4       Toyota    Corolla  Islamabad      Petrol   Black     Local      Sedan   

  Transmission Type Registration Status  
0         Automatic       Un-Registered  
1         Automatic          Registered  
2         Automatic       Un-Registered  
3            Manual          Registered  
4            Manual          Registered  
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
Data Types After Encoding:
Company Name           int64
Model Name             int64
Price                  int64
Model Year             int64
Location               int64
Mileage                int64
Engine Type            int64
Engine Capacity        int64
Color                  int64
Assembly               int64
Body Type              int64
Transmission Type      int64
Registration Status    int64
dtype: object

Updated Dataset Sample:
   Company Name  Model Name    Price  Model Year  Location  Mileage  \
0            28         181  2385000        2017         1     9869   
1            28          53   111000        2019         2    11111   
2            27          14  1530000        2019         2    17500   
3            27          14  1650000        2019         4     9600   
4            28          53  1435000        2010         1   120000   

   Engine Type  Engine Capacity  Color  Assembly  Body Type  \
0            2             1000     18         0          1   
1            2             1300     21         1          4   
2            2              660     21         1          1   
3            2              660     21         1          1   
4            2             1300      2         1          4   

   Transmission Type  Registration Status  
0                  0                    1  
1                  0                    0  
2                  0                    1  
3                  1                    0  
4                  1                    0  

Model Performance:
Mean Squared Error (MSE): 488380485046.61
R-squared Score (R²): 0.9451
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image