In [1]:
# Import necessary libraries
import pandas as pd # Used for data manipulation and analysis, including reading CSV files and handling structured data.
import numpy as np # Provides support for numerical computations and handling large multidimensional arrays.
import matplotlib.pyplot as plt # Used for data visualization, creating graphs and charts to explore data distributions.
import seaborn as sns # Built on top of Matplotlib, provides enhanced visualization capabilities for statistical data.
from sklearn.model_selection import train_test_split # Splits the dataset into training and testing sets for model evaluation.
from sklearn.preprocessing import LabelEncoder, OneHotEncoder # Encodes categorical variables into numerical format for ML models.
from sklearn.ensemble import RandomForestRegressor # Implements the Random Forest algorithm for car price prediction.
from sklearn.metrics import mean_squared_error, r2_score # Evaluates model performance using MSE and R² metrics.
# Load Dataset
# NOTE(review): absolute, machine-specific Windows path; the script only runs
# where this exact file exists. Consider making it configurable.
file_path = r"D:\Portfolio 2.0\Datasets\DataSet Car.csv"
df = pd.read_csv(file_path)
# Drop the irrelevant "Unnamed: 0" column (presumably an index column written
# out when the CSV was saved). errors="ignore" keeps this step from raising
# KeyError when the column is absent, e.g. if the CSV is re-exported without
# it. Reassigning instead of inplace=True yields the same frame.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")
# Step 1: Exploratory Data Analysis (EDA)
# Column names, non-null counts, dtypes and memory usage
df.info()
# Descriptive statistics as computed by DataFrame.describe()
summary_stats = df.describe()
print("\nSummary statistics:\n", summary_stats)
# Count nulls per column, then report only the columns that have at least one
missing_values = df.isna().sum()
columns_with_missing = missing_values[missing_values > 0]
print("\nMissing values in dataset:\n", columns_with_missing)
# Columns stored with object dtype are treated as categorical from here on
categorical_columns = df.select_dtypes(include=["object"]).columns
print("\nCategorical Columns:", categorical_columns)
# Preview the raw category values before any encoding is applied
categorical_preview = df[categorical_columns].head()
print("\nSample Values from Categorical Columns:")
print(categorical_preview)
# Work on a copy so the original df keeps its raw category values while every
# column of the copy becomes numeric for the correlation matrix.
df_encoded = df.copy()
# Each categorical column is cast to str and replaced by the integer codes of
# its own freshly fitted LabelEncoder (the encoders are not kept here).
for col in categorical_columns:
    text_values = df_encoded[col].astype(str)
    df_encoded[col] = LabelEncoder().fit_transform(text_values)
# Correlation Heatmap (After Encoding)
correlation_matrix = df_encoded.corr()
plt.figure(figsize=(10, 6))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap="coolwarm")
plt.title("Feature Correlation Heatmap")
plt.show()
# Price Distribution: 50-bin histogram of the Price column with a KDE overlay
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(data=df, x="Price", bins=50, kde=True, color="blue", ax=ax)
ax.set_title("Car Price Distribution")
ax.set_xlabel("Price")
ax.set_ylabel("Frequency")
plt.show()
# Boxplot of Price: vertical box plot of the same column
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=df, y="Price", ax=ax)
ax.set_title("Box Plot of Car Prices")
plt.show()
# Step 2: Preprocessing - Encoding Categorical Features
# One LabelEncoder per categorical column, stored by column name so each
# fitted encoder remains available after df itself has been overwritten.
label_encoders = {col: LabelEncoder() for col in categorical_columns}
for col, le in label_encoders.items():
    # Cast to str first so every value in the column is encoded as text
    df[col] = le.fit_transform(df[col].astype(str))
# Verify all categorical columns are now numeric
print("\nData Types After Encoding:", df.dtypes, sep="\n")
# Display updated dataset sample
print("\nUpdated Dataset Sample:", df.head(), sep="\n")
# Train-Test Split
# "Price" is the target; every remaining column is used as a feature.
y = df["Price"]
X = df.drop(columns="Price")
# Hold out 20% of the rows for testing; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Train Random Forest Model: 100 trees, depth capped at 10, fixed seed.
# fit() returns the estimator itself, so construction and fitting are chained.
rf_model = RandomForestRegressor(
    n_estimators=100, max_depth=10, random_state=42
).fit(X_train, y_train)
# Predict prices for the held-out rows
y_pred = rf_model.predict(X_test)
# Evaluate model performance on the held-out rows
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(
    "\nModel Performance:",
    f"Mean Squared Error (MSE): {mse:.2f}",
    f"R-squared Score (R²): {r2:.4f}",
    sep="\n",
)
# Visualizations
# Feature Importance Plot
feature_importance = rf_model.feature_importances_
# Feature indices ordered from highest to lowest importance score
sorted_idx = np.argsort(feature_importance)[::-1]
# Keep at most the ten highest-scoring features
top_idx = sorted_idx[:10]
top_names = np.array(X_train.columns)[top_idx]
top_scores = feature_importance[top_idx]
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(top_names, top_scores, color="blue")
ax.set_xlabel("Feature Importance Score")
ax.set_ylabel("Features")
ax.set_title("Top 10 Important Features in Car Price Prediction")
# barh places the first bar at the bottom; flip the axis so the most
# important feature is drawn at the top.
ax.invert_yaxis()
plt.show()
# Actual vs Predicted Prices (Scatter Plot)
plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred, color="blue", alpha=0.5, label="Predicted Prices")
# Endpoints of the y = x reference line, spanning the range of actual prices.
# Series.min()/max() are vectorized and computed once each, instead of the
# builtin min()/max() iterating the Series in Python four times.
price_lo = y_test.min()
price_hi = y_test.max()
plt.plot([price_lo, price_hi], [price_lo, price_hi], 'r--', label="Ideal Prediction")
plt.xlabel("Actual Prices")
plt.ylabel("Predicted Prices")
plt.title("Actual vs Predicted Car Prices (Random Forest)")
plt.legend()
plt.show()
# Price Distribution: Actual vs Predicted
# Compute one set of 30 bin edges over the combined range of actual and
# predicted prices. Passing bins=30 to each hist() call separately would give
# each series its own edges (derived from its own min/max), so the overlaid
# bars would have different widths and could not be compared bin by bin.
combined_prices = np.concatenate([np.asarray(y_test), np.asarray(y_pred)])
price_bin_edges = np.histogram_bin_edges(combined_prices, bins=30)
plt.figure(figsize=(8, 6))
plt.hist(y_test, bins=price_bin_edges, alpha=0.7, color="blue", label="Actual Prices")
plt.hist(y_pred, bins=price_bin_edges, alpha=0.7, color="red", label="Predicted Prices")
plt.xlabel("Car Price")
plt.ylabel("Frequency")
plt.title("Car Price Distribution: Actual vs Predicted")
plt.legend()
plt.show()
# Error Distribution Plot
# Residuals are actual minus predicted, so a positive value means the model
# predicted a price below the actual one.
errors = y_test - y_pred
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(errors, bins=30, kde=True, color="purple", ax=ax)
ax.set_xlabel("Prediction Error")
ax.set_ylabel("Frequency")
ax.set_title("Error Distribution (Random Forest)")
plt.show()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 46022 entries, 0 to 46021 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Company Name 46022 non-null object 1 Model Name 46022 non-null object 2 Price 46022 non-null int64 3 Model Year 46022 non-null int64 4 Location 46022 non-null object 5 Mileage 46022 non-null int64 6 Engine Type 46022 non-null object 7 Engine Capacity 46022 non-null int64 8 Color 46022 non-null object 9 Assembly 46022 non-null object 10 Body Type 46022 non-null object 11 Transmission Type 46022 non-null object 12 Registration Status 46022 non-null object dtypes: int64(4), object(9) memory usage: 4.6+ MB Summary statistics: Price Model Year Mileage Engine Capacity count 4.602200e+04 46022.000000 46022.000000 46022.000000 mean 2.014153e+06 2011.035374 90965.128243 1313.115575 std 2.939071e+06 6.399403 63656.656034 614.690832 min 1.110000e+05 1990.000000 1.000000 16.000000 25% 8.500000e+05 2007.000000 48899.500000 1000.000000 50% 1.450000e+06 2013.000000 80000.000000 1300.000000 75% 2.300000e+06 2016.000000 120000.000000 1500.000000 max 7.750000e+07 2019.000000 999999.000000 6600.000000 Missing values in dataset: Series([], dtype: int64) Categorical Columns: Index(['Company Name', 'Model Name', 'Location', 'Engine Type', 'Color', 'Assembly', 'Body Type', 'Transmission Type', 'Registration Status'], dtype='object') Sample Values from Categorical Columns: Company Name Model Name Location Engine Type Color Assembly Body Type \ 0 Toyota Vitz Islamabad Petrol Silver Imported Hatchback 1 Toyota Corolla KPK Petrol White Local Sedan 2 Suzuki Alto KPK Petrol White Local Hatchback 3 Suzuki Alto Punjab Petrol White Local Hatchback 4 Toyota Corolla Islamabad Petrol Black Local Sedan Transmission Type Registration Status 0 Automatic Un-Registered 1 Automatic Registered 2 Automatic Un-Registered 3 Manual Registered 4 Manual Registered
Data Types After Encoding: Company Name int64 Model Name int64 Price int64 Model Year int64 Location int64 Mileage int64 Engine Type int64 Engine Capacity int64 Color int64 Assembly int64 Body Type int64 Transmission Type int64 Registration Status int64 dtype: object Updated Dataset Sample: Company Name Model Name Price Model Year Location Mileage \ 0 28 181 2385000 2017 1 9869 1 28 53 111000 2019 2 11111 2 27 14 1530000 2019 2 17500 3 27 14 1650000 2019 4 9600 4 28 53 1435000 2010 1 120000 Engine Type Engine Capacity Color Assembly Body Type \ 0 2 1000 18 0 1 1 2 1300 21 1 4 2 2 660 21 1 1 3 2 660 21 1 1 4 2 1300 2 1 4 Transmission Type Registration Status 0 0 1 1 0 0 2 0 1 3 1 0 4 1 0 Model Performance: Mean Squared Error (MSE): 488380485046.61 R-squared Score (R²): 0.9451