
Sentiment Analysis of Skincare Product Reviews using NLP


Overview

This project performs sentiment analysis, a common task in natural language processing (NLP) that involves determining the emotional tone of a piece of text, such as a product review. This is useful in e-commerce channels, where it can be used to automatically analyze customer feedback and tailor responses accordingly.

To complete the task, two popular sentiment-analysis algorithms were used: VADER and BERT. Each has its own approach and complexity:

Usually, for short and simple text, such as tweets or product reviews, VADER is a good option. For longer or more complex text, such as news articles or customer feedback surveys, BERT tends to be the better choice. Even so, this project applies both algorithms to compare their performance on a simple task.

The dataset used in this project contains data scraped from Ulta Skincare Reviews before March 27, 2023. The source is at: Skincare-Products-Dataset

1. Project Set-up

This analysis begins with the following steps:

1.1. First look
1.2. Data types and null values


Note: Two columns have null values. Because only three values are affected (less than 1% of the data), we can simply drop them and work with the remaining rows.
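As a minimal sketch (assuming the dataframe is loaded as `df`):

```python
# Only three rows contain nulls (<1% of the data), so drop them outright
df = df.dropna().reset_index(drop=True)
df.isna().sum()  # confirm no null values remain
```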



To better understand the data, let's randomly select a few review texts and see what we are working with:
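For instance, something like the following (the `Review_Text` column name is an assumption about the dataset's schema):

```python
# Print a handful of random reviews to get a feel for the writing style
for text in df["Review_Text"].sample(5, random_state=42):
    print(text)
    print("-" * 80)
```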

Then, let's look at the titles of the reviews to see if there is a pattern.

2. Feature extraction & EDA


This part contains the following steps:

1. Extract the date on which each review was made: the idea is to see whether there were peaks and valleys in product reviews across the years (and, further on, whether those periods skew more positive or negative);

2. Extract the number of characters in each review: is there a relation between how much you write and how satisfied (or dissatisfied) you are? Do people tend to leave longer reviews when they are more, or less, satisfied? These questions will be answered through the data (see the sketch after this list);

3. Brands and Products: an overview of the favorite products and brands.
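A rough sketch of steps 1 and 2. It assumes the raw `Review_Date` column stores offsets relative to the scrape date (e.g. "2 months ago") and that the text lives in `Review_Text`; both column names and the date format are assumptions:

```python
import pandas as pd

SCRAPE_DATE = pd.Timestamp("2023-03-27")  # the dataset was scraped before this date

def approx_review_date(relative: str) -> pd.Timestamp:
    """Turn a relative date like '2 months ago' into an approximate timestamp."""
    qty, unit = relative.split()[:2]
    n = 1 if qty in ("a", "an") else int(qty)
    # e.g. '2 months ago' -> SCRAPE_DATE - DateOffset(months=2)
    return SCRAPE_DATE - pd.DateOffset(**{unit.rstrip("s") + "s": n})

df["review_date"] = df["Review_Date"].apply(approx_review_date)
df["review_length"] = df["Review_Text"].str.len()  # characters per review
```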

Brands and Products

Because the review date was reconstructed from the time elapsed since scraping, it does not give the exact day of each review. Therefore, only dates within one year of the scrape can be relied on for time-series analysis:

The charts above point to a peak in reviews during October, which may be due to events and promotions, or to something deeper, which will be investigated.

3. VADER Model


- Setting up the texts using NLTK
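A minimal setup sketch with NLTK's VADER implementation; applying it row by row assumes the hypothetical `Review_Text` column again:

```python
import nltk
import pandas as pd
from nltk.sentiment import SentimentIntensityAnalyzer

nltk.download("vader_lexicon")  # one-time download of the VADER lexicon
sia = SentimentIntensityAnalyzer()

# polarity_scores returns the neg/neu/pos proportions plus a compound score
print(sia.polarity_scores("This moisturizer is amazing, my skin loves it!"))

# Score every review and attach the proportions as new columns
scores = df["Review_Text"].apply(sia.polarity_scores).apply(pd.Series)
df = pd.concat([df, scores], axis=1)  # adds neg, neu, pos, compound
```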

Next steps:

Taking a deeper look at negative reviews:

For a quick overview, let's define a threshold: a review counts as negative if its negative score is 35% or higher. Then let's look at those texts:
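A sketch of that filter, using the `neg` proportion produced by VADER above:

```python
# Reviews whose VADER negative proportion is at least 0.35
negative_reviews = df[df["neg"] >= 0.35].sort_values("neg", ascending=False)
negative_reviews[["Review_Text", "neg", "pos"]]
```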

Note: let's also look at the probability of each being a positive review, to make sure the model really knows what it is classifying.

The texts are too long, so they need to be read another way:

Taking a deeper look at positive reviews:



The previous chart used a probability threshold to define the sentiment of a product review. To get a better estimate, the next plot classifies each review by the predominant sentiment detected by the algorithm: negative, neutral, or positive:
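One way to sketch this is to pick whichever of the three VADER proportions is largest:

```python
# Classify each review by its largest VADER proportion
df["sentiment"] = (
    df[["neg", "neu", "pos"]]
    .idxmax(axis=1)
    .map({"neg": "Negative", "neu": "Neutral", "pos": "Positive"})
)
df["sentiment"].value_counts()
```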



Now, a new column with the predicted review class will be added to the dataframe, so we can see how it behaves month by month:
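A sketch of the monthly breakdown, reusing the `sentiment` and approximate `review_date` columns built earlier:

```python
import matplotlib.pyplot as plt

# Count each predicted class per month and plot a stacked bar chart
monthly = (
    df.groupby([df["review_date"].dt.to_period("M"), "sentiment"])
      .size()
      .unstack(fill_value=0)
)
monthly.plot(kind="bar", stacked=True, figsize=(10, 4),
             title="Predicted review sentiment by month")
plt.tight_layout()
plt.show()
```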

It can be seen that September had an increase in positive reviews relative to overall reviews, whereas in October, even though the number of reviews grew, there was a sharp increase in neutral sentiment, which may have caused sales to fall back (in other words, normalize) in the following months.

4. BERT Model


According to the Hugging Face community, the creators of the `transformers` library, it provides state-of-the-art machine learning for PyTorch, TensorFlow, and JAX. In a nutshell:

🤗 Transformers provides APIs and tools to easily download and train state-of-the-art pretrained models. Using pretrained models can reduce your compute costs, carbon footprint, and save you the time and resources required to train a model from scratch. These models support common tasks in different modalities, such as:

(1) 📝 Natural Language Processing: text classification, named entity recognition, question answering, sentiment analysis, language modeling, summarization, translation, multiple choice, and text generation.
(2) 🖼️ Computer Vision: image classification, object detection, and segmentation.
(3) 🗣️ Audio: automatic speech recognition and audio classification.
(4) 🐙 Multimodal: table question answering, optical character recognition, information extraction from scanned documents, video classification, and visual question answering.

For this project, a **model pretrained on ~58M tweets and fine-tuned for sentiment analysis with the TweetEval benchmark** will be used. The source can be found at [Hugging-Face-index](https://huggingface.co/docs/transformers/index)
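A sketch of the download step; the Hub checkpoint matching that description is assumed here to be `cardiffnlp/twitter-roberta-base-sentiment`:

```python
from transformers import AutoModelForSequenceClassification, AutoTokenizer

MODEL = "cardiffnlp/twitter-roberta-base-sentiment"  # assumed checkpoint
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
```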



After downloading the model, we can use it to compute the predicted sentiment logits and apply a softmax function to turn them into probabilities for each class.
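A minimal scoring sketch, reusing the tokenizer and model loaded above (the `Review_Text` column remains an assumption):

```python
import pandas as pd
import torch

def bert_scores(text: str) -> dict:
    """Return negative/neutral/positive probabilities for one review."""
    # Truncate to the model's 512-token limit so long reviews don't error out
    encoded = tokenizer(text, return_tensors="pt",
                        truncation=True, max_length=512)
    with torch.no_grad():
        logits = model(**encoded).logits[0]
    probs = torch.softmax(logits, dim=-1).tolist()
    # Label order for this checkpoint: 0 = negative, 1 = neutral, 2 = positive
    return {"bert_neg": probs[0], "bert_neu": probs[1], "bert_pos": probs[2]}

bert_results = df["Review_Text"].apply(bert_scores).apply(pd.Series)
```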


Lastly, the VADER and BERT results will be combined into a single dataframe and plotted against each other to see how much they differ. Let's also suppose that 'upvotes' highlight the comments that best reflect how the community sees the product.
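A sketch of the comparison, joining VADER's columns with the BERT probabilities computed above:

```python
import matplotlib.pyplot as plt
import seaborn as sns

combined = pd.concat([df[["neg", "neu", "pos"]], bert_results], axis=1)

# One scatter per sentiment class: VADER proportion vs. BERT probability
pairs = [("neg", "bert_neg"), ("neu", "bert_neu"), ("pos", "bert_pos")]
fig, axes = plt.subplots(1, 3, figsize=(12, 4))
for ax, (vader_col, bert_col) in zip(axes, pairs):
    sns.scatterplot(data=combined, x=vader_col, y=bert_col, ax=ax)
    ax.set_title(f"VADER vs BERT: {vader_col}")
plt.tight_layout()
plt.show()
```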

5. Conclusion

Even though both models perform the same task, one would expect a positive correlation between the two models' predicted negative, neutral, and positive probabilities for each row. However, that is not what is seen: the points are completely scattered. It can be concluded that the VADER and BERT models interpret the data very differently.
As could be seen, the VADER model does not interpret semantics: texts containing words like "non" or "dead" are bound to be classified as negative statements, which in reality was not the case. The BERT model corrects those mistakes and is a much more reliable choice for this sentiment analysis.