In [1]:
# Importing all necessary libraries
import pandas as pd
import json
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from datetime import datetime
import nltk
import seaborn as sns
from collections import Counter
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer
import plotly.graph_objs as go
from plotly.subplots import make_subplots

# Ensure NLTK resources are downloaded
nltk.download('stopwords')  
nltk.download('punkt')  

# Function to load and parse JSON data into a DataFrame
def load_data(file_path):
    """
    Reads a JSON file and converts it into a Pandas DataFrame.
    Args:
        file_path (str): Path to the JSON file.
    Returns:
        pd.DataFrame: A DataFrame containing the JSON data.
    """
    print("Loading dataset...")  
    try:
        data = pd.read_json(file_path, lines=True)
        print("Dataset loaded successfully!")
        return data
    except ValueError as e:
        print(f"Error loading JSON file: {e}")
        return pd.DataFrame()

# Function to create visualizations
def create_visualizations(data, asin):
    """
    Generate visualizations for a given product dataset.
    Args:
        data (DataFrame): Product-specific data.
        asin (str): Product ID.
    """
    # Bar Plot for Ratings Distribution
    plt.figure(figsize=(10, 6))  
    sns.countplot(data=data, x='overall', palette='coolwarm')
    plt.title(f"Ratings Distribution for Product {asin}", fontsize=14)
    plt.xlabel("Ratings")
    plt.ylabel("Count")
    plt.savefig(f'ratings_distribution_{asin}.png')
    plt.show()

    # Word Cloud for Reviews
    tokenizer = RegexpTokenizer(r'\w+')
    stemmer = PorterStemmer()
    stop_words = set(stopwords.words('english'))

    all_words = " ".join(data['reviewText'])
    tokens = [stemmer.stem(word) for word in tokenizer.tokenize(all_words.lower()) if word not in stop_words]
    word_freq = Counter(tokens)

    # Introducing variation by altering word frequencies slightly
    for key in word_freq.keys():
        word_freq[key] += 1  # Adding a constant to all frequencies for randomness

    wordcloud = WordCloud(width=800, height=400, background_color='black', colormap='viridis').generate_from_frequencies(word_freq)
    plt.figure(figsize=(12, 8))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(f"Word Cloud for Product {asin}", fontsize=14)
    plt.savefig(f'wordcloud_{asin}.png')
    plt.show()

    # Sentiment Distribution for the profucts (1st commit as comment)
    plt.figure(figsize=(10, 6))  
    sns.histplot(data['sentiment'], kde=True, color='purple', bins=20)
    plt.title(f"Sentiment Analysis for Product {asin}", fontsize=14)
    plt.xlabel("Sentiment Score")
    plt.ylabel("Frequency")
    plt.savefig(f'sentiment_distribution_{asin}.png')
    plt.show()

    # Sentiment Trend Over Time
    data['reviewTime'] = pd.to_datetime(data['reviewTime'])
    monthly_sentiment = data.groupby(data['reviewTime'].dt.to_period("M"))['sentiment'].mean()
    monthly_sentiment.plot(kind='line', marker='o', figsize=(10, 6), color='orange')  # pandas plot function
    plt.title(f"Sentiment Trend Over Time for Product {asin}")
    plt.xlabel("Month")
    plt.ylabel("Average Sentiment Score")
    plt.grid(True)
    plt.savefig(f'sentiment_trend_{asin}.png')
    plt.show()

    # Bubble Chart: Ratings vs Votes
    print(f"Creating Bubble Chart for {asin}...")
    plt.figure(figsize=(10, 6))
    sns.scatterplot(
        x=data['vote'], 
        y=data['overall'], 
        size=data['vote'], 
        sizes=(50, 500), 
        hue=data['overall'], 
        palette='cool',    # color and size change of bubble chart another commit
        alpha=0.7
    )
    plt.title(f"Bubble Chart: Ratings vs Votes for Product {asin}", fontsize=14)
    plt.xlabel("Votes")
    plt.ylabel("Ratings")
    plt.legend(title="Ratings", loc='upper left', bbox_to_anchor=(1, 1))
    plt.grid(True)
    plt.savefig(f'bubble_chart_{asin}.png')
    plt.show()

    # Interactive Dashboard with Plotly
    print("Creating interactive dashboard...")  
    fig = make_subplots(rows=1, cols=2, subplot_titles=("Ratings vs Votes", "Sentiment Distribution"))

    fig.add_trace(
        go.Scatter(x=data['vote'], y=data['overall'], mode='markers',
                   marker=dict(
            size=data['vote'],  # Bubble size directly proportional to votes
            color=data['overall'],  # Used ratings for color
            colorscale='Cividis',  # Updated color scale
            showscale=True),
                   name="Votes vs Ratings"),
        row=1, col=1
    )

    fig.add_trace(
        go.Histogram(x=data['sentiment'], nbinsx=20, marker_color='teal', name="Sentiment"),
        row=1, col=2
    )

    fig.update_layout(height=600, width=1000, title_text=f"Dashboard for Product {asin}")
    fig.write_html(f'interactive_dashboard_{asin}.html')
    fig.show()

# File path to the dataset
file_path = "D:\Portfolio 2.0\Datasets\Industrial_and_Scientific.json"

# Load dataset
reviews_df = load_data(file_path)

# Data Cleaning and Preprocessing
print("Data cleaning and preprocessing started...")
columns_to_keep = ['asin', 'reviewText', 'overall', 'unixReviewTime', 'summary', 'reviewerName', 'vote']
reviews_df = reviews_df[columns_to_keep]

# Handle missing values and clean the 'vote' column
reviews_df['reviewerName'] = reviews_df['reviewerName'].fillna("Anonymous")
reviews_df['vote'] = reviews_df['vote'].fillna(0).astype(str).str.replace(',', '').astype(int)
reviews_df.dropna(subset=['reviewText', 'overall'], inplace=True)

# Convert UNIX timestamp to readable date
reviews_df['reviewTime'] = pd.to_datetime(reviews_df['unixReviewTime'], unit='s')
print("Data cleaning completed!")

# Analyze and Visualize Data
products = reviews_df['asin'].unique()[3:6]  # Selecting a unique range of three products for analysis
product_data = {asin: reviews_df[reviews_df['asin'] == asin] for asin in products}

# Sentiment Analysis
analyzer = SentimentIntensityAnalyzer() 

for asin, data in product_data.items():
    print(f"\nAnalyzing Product: {asin}")

    # Sentiment Scores
    data['sentiment'] = data['reviewText'].apply(lambda x: analyzer.polarity_scores(x)['compound'])

    # Generate Visualizations
    create_visualizations(data, asin)

print("Result is saved as analysis is completed.")

# Save Cleaned Data
output_file = 'J119811_updated_with_dashboard.csv'  
reviews_df.to_csv(output_file, index=False)
print(f"Cleaned dataset saved to {output_file}.")



# References
# Plotly. (n.d.). https://plotly.com/python/
# View of VADER: A Parsimonious Rule-Based Model for Sentiment Analysis of Social Media Text. (n.d.). 
#     https://ojs.aaai.org/index.php/ICWSM/article/view/14550/14399
# Waskom, M. (2021). seaborn: statistical data visualization. The Journal of Open Source Software, 6(60), 3021.
#     https://doi.org/10.21105/joss.03021
# Gallery of Examples — wordcloud 1.8.1 documentation. (n.d.). 
#     https://amueller.github.io/word_cloud/auto_examples/index.html
# MatPlotLib: a 2D Graphics environment. (2007, June 1). IEEE Journals & Magazine | IEEE Xplore. 
#     https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=4160265
<>:144: SyntaxWarning: invalid escape sequence '\P'
<>:144: SyntaxWarning: invalid escape sequence '\P'
C:\Users\s\AppData\Local\Temp\ipykernel_12012\1201817223.py:144: SyntaxWarning: invalid escape sequence '\P'
  file_path = "D:\Portfolio 2.0\Datasets\Industrial_and_Scientific.json"
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\s\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\s\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Loading dataset...
Dataset loaded successfully!
Data cleaning and preprocessing started...
Data cleaning completed!

Analyzing Product: 1587792052
C:\Users\s\AppData\Local\Temp\ipykernel_12012\1201817223.py:174: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['sentiment'] = data['reviewText'].apply(lambda x: analyzer.polarity_scores(x)['compound'])
C:\Users\s\AppData\Local\Temp\ipykernel_12012\1201817223.py:51: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.countplot(data=data, x='overall', palette='coolwarm')
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
C:\Users\s\AppData\Local\Temp\ipykernel_12012\1201817223.py:89: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['reviewTime'] = pd.to_datetime(data['reviewTime'])
No description has been provided for this image
Creating Bubble Chart for 1587792052...
No description has been provided for this image
Creating interactive dashboard...
Analyzing Product: 6565984549
C:\Users\s\AppData\Local\Temp\ipykernel_12012\1201817223.py:174: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

C:\Users\s\AppData\Local\Temp\ipykernel_12012\1201817223.py:51: FutureWarning:



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.


No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
C:\Users\s\AppData\Local\Temp\ipykernel_12012\1201817223.py:89: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

No description has been provided for this image
Creating Bubble Chart for 6565984549...
No description has been provided for this image
Creating interactive dashboard...
Analyzing Product: 8803000844
C:\Users\s\AppData\Local\Temp\ipykernel_12012\1201817223.py:174: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

C:\Users\s\AppData\Local\Temp\ipykernel_12012\1201817223.py:51: FutureWarning:



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.


No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
C:\Users\s\AppData\Local\Temp\ipykernel_12012\1201817223.py:89: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

No description has been provided for this image
Creating Bubble Chart for 8803000844...
No description has been provided for this image
Creating interactive dashboard...
Result is saved as analysis is completed.
Cleaned dataset saved to J119811_updated_with_dashboard.csv.