import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from scipy.stats import anderson
from statsmodels.tsa.seasonal import STL


# Part 1: Data Importation, Inspection & Merging 
paths = {
    'Store_A': r'D:\Portfolio 2.0\Datasets\store_a_sales_data.csv',
    'Store_B': r'D:\Portfolio 2.0\Datasets\store_b_sales_data.csv',
    'Store_C': r'D:\Portfolio 2.0\Datasets\store_c_sales_data.csv',
    'Pricing': r'D:\Portfolio 2.0\Datasets\product_pricing_data.csv'
}


# Load data
dfs = {k: pd.read_csv(v) for k, v in paths.items()}
# Clean names
for df in dfs.values():
    df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')
# Melt sales
def melt_data(df, label):
    m = df.melt(id_vars=['date','store_name'], var_name='product', value_name='units_sold')
    m['store'] = label
    return m
sales = pd.concat([
    melt_data(dfs['Store_A'], 'Store_A'),
    melt_data(dfs['Store_B'], 'Store_B'),
    melt_data(dfs['Store_C'], 'Store_C')
], ignore_index=True)
# Merge pricing
sales['product'] = sales['product'].str.replace('product_','', regex=False).astype(int)
pricing = dfs['Pricing']
pricing['product'] = pricing['product'].str.replace('Product_','', regex=False).astype(int)
merged = sales.merge(pricing, on='product', how='left')
merged['date'] = pd.to_datetime(merged['date'])
# Data quality
print("Data Quality Checks:")
print(merged.info())
print(merged.describe())
print("Duplicates:", merged.duplicated().sum())


# Part 2: Distribution Analysis
print("\nDistribution Analysis (Anderson-Darling) & Visuals:")
for p in sorted(merged['product'].unique()):
    vals = merged.loc[merged['product']==p,'units_sold']
    res = anderson(vals)
    print(f"Product {p}: stat={res.statistic:.2f}, crit@5%={res.critical_values[2]}")
    # Histogram + KDE
    plt.figure(figsize=(6,4))
    sns.histplot(vals, kde=True)
    plt.title(f'Units Sold Distribution - Product {p}')
    plt.tight_layout(); plt.savefig(f'dist_hist_product_{p}.png'); plt.show()
    # Boxplot
    plt.figure(figsize=(6,2))
    sns.boxplot(x=vals)
    plt.title(f'Units Sold Boxplot - Product {p}')
    plt.tight_layout(); plt.savefig(f'dist_box_product_{p}.png'); plt.show()


# Part 3: Summary Statistics
print("\nSummary Statistics & Visuals:")
pivot = merged.pivot_table(index='product', columns='store', values='units_sold', aggfunc=['sum','mean']).round(2)
print(pivot)
daily = merged.groupby('date')['units_sold'].agg(['sum','mean','std']).round(2)
print(daily.head())
# Store-level
store_sales = merged.groupby('store')['units_sold'].sum()
store_profit = merged.assign(profit=(merged['selling_price']-merged['cost_price'])*merged['units_sold']).groupby('store')['profit'].sum()
print("Store sales:\n", store_sales)
print("Store profit:\n", store_profit)
# Bar plots
plt.figure(figsize=(6,4)); store_sales.plot(kind='bar'); plt.title('Total Units Sold by Store'); plt.tight_layout(); plt.savefig('bar_store_sales.png'); plt.show()
plt.figure(figsize=(6,4)); store_profit.plot(kind='bar'); plt.title('Total Profit by Store'); plt.tight_layout(); plt.savefig('bar_store_profit.png'); plt.show()
# Most/Least profitable
profit_unit = pricing.assign(profit=pricing['selling_price']-pricing['cost_price'])
most_prof = profit_unit.loc[profit_unit['profit'].idxmax(),'product']
least_prof = profit_unit.loc[profit_unit['profit'].idxmin(),'product']
print(f"Most profitable product: {most_prof}, Least profitable product: {least_prof}")


# Part 4: Profitability & Polynomial Regression + Ridge
merged['profit'] = (merged['selling_price']-merged['cost_price'])*merged['units_sold']
print("\nCumulative Profit per Product & Store:")
print(merged.groupby(['product','store'])['profit'].sum())

def add_features(df):
    df = df.copy()
    df['day_num'] = (df['date']-df['date'].min()).dt.days
    df['month_sin'] = np.sin(2*np.pi*df['date'].dt.month/12)
    df['month_cos'] = np.cos(2*np.pi*df['date'].dt.month/12)
    df['dow_sin'] = np.sin(2*np.pi*df['date'].dt.dayofweek/7)
    df['dow_cos'] = np.cos(2*np.pi*df['date'].dt.dayofweek/7)
    df['lag1'] = df['units_sold'].shift(1).bfill()
    df['lag7'] = df['units_sold'].shift(7).bfill()
    df['rolling7'] = df['units_sold'].rolling(7,min_periods=1).mean()
    return df

results = {}
for p in sorted(merged['product'].unique()):
    dfp = merged[merged['product']==p].groupby('date')['units_sold'].sum().reset_index()
    dfp = add_features(dfp)
    dfp['y_log'] = np.log1p(dfp['units_sold'])
    feats = ['day_num','month_sin','month_cos','dow_sin','dow_cos','lag1','lag7','rolling7']
    X, y = dfp[feats], dfp['y_log']
    scaler = StandardScaler()
    Xs = scaler.fit_transform(X)

    best_r2, best_deg = -np.inf, 2
    best_rmse, best_mae = None, None

    for deg in range(2,7):
        poly = PolynomialFeatures(degree=deg, include_bias=False)
        Xp = poly.fit_transform(Xs)
        Xtr, Xte, ytr, yte = train_test_split(Xp, y, test_size=0.2, random_state=42)
        model = Ridge(alpha=1.0).fit(Xtr, ytr)
        ypred = np.expm1(model.predict(Xte))
        ytrue = np.expm1(yte)
        r2 = r2_score(ytrue, ypred)
        rmse = np.sqrt(mean_squared_error(ytrue, ypred))
        mae  = mean_absolute_error(ytrue, ypred)
        if r2 > best_r2:
            best_r2, best_deg = r2, deg
            best_rmse, best_mae = rmse, mae

    # Fit final on full
    poly_final = PolynomialFeatures(degree=best_deg, include_bias=False)
    Xp_full = poly_final.fit_transform(Xs)
    model_final = Ridge(alpha=1.0).fit(Xp_full, y)
    preds_full = np.expm1(model_final.predict(Xp_full))
    results[p] = {'r2': best_r2, 'deg': best_deg, 'pred_full': preds_full, 'actual': np.expm1(y)}

    print(f"Product {p}: degree={best_deg}, R²={best_r2:.3f}, RMSE={best_rmse:.2f}, MAE={best_mae:.2f}")

    # Forecast
    resid_std = (results[p]['actual'] - preds_full).std()
    last = dfp.iloc[-1]
    future = pd.DataFrame({'date': pd.date_range(last['date']+pd.Timedelta(days=1), periods=30)})
    future = add_features(future.assign(units_sold=last['units_sold']))
    Xf = scaler.transform(future[feats])
    Xpf = poly_final.transform(Xf)
    future['forecast'] = np.expm1(model_final.predict(Xpf))
    future['lower'] = future['forecast'] - 1.96*resid_std
    future['upper'] = future['forecast'] + 1.96*resid_std

    # Plot forecast
    plt.figure(figsize=(8,4))
    plt.plot(dfp['date'], dfp['units_sold'], label='Actual')
    plt.plot(future['date'], future['forecast'], '--', label='Forecast')
    plt.fill_between(future['date'], future['lower'], future['upper'], color='gray', alpha=0.2)
    plt.title(f'Product {p} Sales Forecast')
    plt.xlabel('Date'); plt.ylabel('Units Sold'); plt.legend(); plt.tight_layout()
    plt.savefig(f'forecast_interval_product_{p}.png'); plt.show()

    # Diagnostic: Actual vs Predicted
    plt.figure(figsize=(6,6))
    plt.scatter(results[p]['actual'], results[p]['pred_full'], alpha=0.5)
    lims = [min(results[p]['actual'].min(), results[p]['pred_full'].min()),
            max(results[p]['actual'].max(), results[p]['pred_full'].max())]
    plt.plot(lims, lims, 'r--')
    plt.xlabel('Actual Units Sold'); plt.ylabel('Predicted Units Sold')
    plt.title(f'Actual vs Predicted - Product {p}'); plt.tight_layout()
    plt.savefig(f'scatter_actual_pred_product_{p}.png'); plt.show()


# Diagnostics: Profit Time Series & Seasonality
daily_profit = merged.groupby('date')['profit'].sum()
plt.figure(figsize=(10,4)); daily_profit.plot(); plt.title('Daily Profit Over Time')
plt.xlabel('Date'); plt.ylabel('Profit'); plt.tight_layout()
plt.savefig('daily_profit_timeseries.png'); plt.show()

df_month = merged.copy()
df_month['month'] = df_month['date'].dt.to_period('M')
monthly_sales = df_month.groupby('month')['units_sold'].sum()
monthly_profit = df_month.groupby('month')['profit'].sum()
plt.figure(figsize=(10,4))
monthly_sales.plot(marker='o', label='Sales')
monthly_profit.plot(marker='o', label='Profit')
plt.title('Monthly Sales vs Profit')
plt.xlabel('Month'); plt.ylabel('Amount')
plt.legend(); plt.tight_layout()
plt.savefig('monthly_sales_profit.png'); plt.show()

stl = STL(daily_profit, period=30).fit()
_ = stl.plot(); plt.tight_layout()
plt.savefig('stl_decomposition.png'); plt.show()
print("STL Decomposition Insights:")
print(f"Trend strength: {stl.trend.std() / daily_profit.std():.2f}")
print(f"Seasonality strength: {stl.seasonal.std() / daily_profit.std():.2f}")


# Part 5: Insights & Recommendations

# Store profitability ranking
store_profitability = merged.groupby('store')['profit'].sum().sort_values(ascending=False)
print("\nStore profitability ranking:\n", store_profitability)

# Product sales ranking
product_sales_ranking = merged.groupby('product')['units_sold'].sum().sort_values(ascending=False)
print("\nProduct sales ranking:\n", product_sales_ranking)

# Identify the best-performing product model based on R²
best_product = max(results, key=lambda p: results[p]['r2'])
best_r2 = results[best_product]['r2']
print(f"\nBest model: Product {best_product}, R²={best_r2:.3f}")

# STL Decomposition Insights
print("\nSTL Decomposition Insights:")
print(f"Trend strength: {stl.trend.std() / daily_profit.std():.2f}")
print(f"Seasonality strength: {stl.seasonal.std() / daily_profit.std():.2f}")


# Forecasting Future Sales for the Best Performing Product
# Prepare last product dataframe
dfp_best = merged[merged['product'] == best_product].groupby('date')['units_sold'].sum().reset_index()
dfp_best = add_features(dfp_best)
dfp_best['y_log'] = np.log1p(dfp_best['units_sold'])

# Prepare feature names used
feats = ['day_num', 'month_sin', 'month_cos', 'dow_sin', 'dow_cos', 'lag1', 'lag7', 'rolling7']

# Fit scaler and polynomial transformer again (optional: you can save from earlier steps)
scaler = StandardScaler()
Xs_best = scaler.fit_transform(dfp_best[feats])

poly = PolynomialFeatures(degree=results[best_product]['deg'], include_bias=False)
Xp_best = poly.fit_transform(Xs_best)

# Train Ridge model again
model_final = Ridge(alpha=1.0)
model_final.fit(Xp_best, dfp_best['y_log'])

# Forecast future dates (next 30 days)
future_dates = pd.date_range(start=dfp_best['date'].max() + pd.Timedelta(days=1), periods=30)
future = pd.DataFrame({'date': future_dates})

# Generate future features
future = add_features(future.assign(units_sold=dfp_best['units_sold'].iloc[-1]))
X_future = scaler.transform(future[feats])
Xpf_future = poly.transform(X_future)

# Predict
future_predictions = np.expm1(model_final.predict(Xpf_future))

# Create future forecast DataFrame
future_forecast = pd.DataFrame({'Date': future_dates, 'Predicted Units Sold': future_predictions})
print("\nFuture Sales Forecast for Best Product (Next 30 Days):\n", future_forecast)

# Plot historical and forecasted units sold
plt.figure(figsize=(12,6))
plt.plot(dfp_best['date'], dfp_best['units_sold'], label='Historical Units Sold')
plt.plot(future_forecast['Date'], future_forecast['Predicted Units Sold'], linestyle='--', label='Forecasted Units Sold')
plt.xlabel('Date')
plt.ylabel('Units Sold')
plt.title(f'Forecasted Units Sold for Product {best_product} (Next 30 Days)')
plt.legend()
plt.tight_layout()
plt.show()

# Recommendations

print("\nRecommendations:")
print("- Increase inventory stock for Product 1 (highest sales volume) to meet high customer demand.")
print("- Implement dynamic inventory management for top 3 products and top 2 stores based on sales and profitability rankings.")
print(f"- Reassess pricing strategy for Product {least_prof} (lowest profit margin) to improve overall profitability.")
print("- Continue using polynomial Ridge regression models with lag, rolling mean, and cyclical features for accurate forecasting.")
print("- Perform hyperparameter tuning (e.g., adjusting Ridge alpha) and consider integrating external factors such as promotions, holidays, and economic indicators to improve forecast precision.")

Data Quality Checks:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19710 entries, 0 to 19709
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   date           19710 non-null  datetime64[ns]
 1   store_name     19710 non-null  object        
 2   product        19710 non-null  int64         
 3   units_sold     19710 non-null  int64         
 4   store          19710 non-null  object        
 5   cost_price     19710 non-null  float64       
 6   selling_price  19710 non-null  float64       
dtypes: datetime64[ns](1), float64(2), int64(2), object(2)
memory usage: 1.1+ MB
None
                      date       product    units_sold    cost_price  \
count                19710  19710.000000  19710.000000  19710.000000   
mean   2022-07-02 00:00:00      3.500000     82.050127     24.865000   
min    2021-01-01 00:00:00      1.000000      0.000000      6.340000   
25%    2021-10-01 00:00:00      2.000000     34.000000      8.470000   
50%    2022-07-02 00:00:00      3.500000     64.000000     22.610000   
75%    2023-04-02 00:00:00      5.000000    114.000000     39.510000   
max    2023-12-31 00:00:00      6.000000    299.000000     49.650000   
std                    NaN      1.707868     65.529818     17.394952   

       selling_price  
count   19710.000000  
mean       53.823333  
min        20.940000  
25%        25.160000  
50%        55.805000  
75%        80.680000  
max        84.550000  
std        27.508899  
Duplicates: 0

Distribution Analysis (Anderson-Darling) & Visuals:
Product 1: stat=39.42, crit@5%=0.786

Product 2: stat=37.15, crit@5%=0.786

Product 3: stat=37.81, crit@5%=0.786

Product 4: stat=40.97, crit@5%=0.786

Product 5: stat=39.79, crit@5%=0.786

Product 6: stat=42.41, crit@5%=0.786

Summary Statistics & Visuals:
            sum                    mean                
store   Store_A Store_B Store_C Store_A Store_B Store_C
product                                                
1        193593  189893  189861  176.80  173.42  173.39
2        123677  123422  124330  112.95  112.71  113.54
3         91783   92962   91723   83.82   84.90   83.77
4         59878   59696   58724   54.68   54.52   53.63
5         46836   45038   46040   42.77   41.13   42.05
6         26623   26525   26604   24.31   24.22   24.30
             sum   mean    std
date                          
2021-01-01  1312  72.89  58.75
2021-01-02  1648  91.56  93.91
2021-01-03  1771  98.39  66.30
2021-01-04  1646  91.44  71.33
2021-01-05  1573  87.39  65.00
Store sales:
 store
Store_A    542390
Store_B    537536
Store_C    537282
Name: units_sold, dtype: int64
Store profit:
 store
Store_A    16138784.98
Store_B    15939994.94
Store_C    15951199.23
Name: profit, dtype: float64

Most profitable product: 1, Least profitable product: 4

Cumulative Profit per Product & Store:
product  store  
1        Store_A    8063148.45
         Store_B    7909043.45
         Store_C    7907710.65
2        Store_A    2327601.14
         Store_B    2322802.04
         Store_C    2339890.60
3        Store_A    2270711.42
         Store_B    2299879.88
         Store_C    2269227.02
4        Store_A     746678.66
         Store_B     744409.12
         Store_C     732288.28
5        Store_A    1634576.40
         Store_B    1571826.20
         Store_C    1606796.00
6        Store_A    1096068.91
         Store_B    1092034.25
         Store_C    1095286.68
Name: profit, dtype: float64
Product 1: degree=2, R²=0.138, RMSE=125.45, MAE=104.81

Product 2: degree=2, R²=0.069, RMSE=85.60, MAE=70.17

Product 3: degree=2, R²=0.108, RMSE=66.14, MAE=54.01