# Read the .CSV file as a dataframe
import os
corpus_path = 'C:/Users/User/Downloads/DisneylandReviews.csv' # Change this path to the preferred/relevant location on your computer
os.chdir(corpus_path)
import warnings
warnings.filterwarnings('ignore') # only use this when you know the script and want to supress unnecessary warnings
import pandas as pd
df = pd.read_csv("DisneylandReviews.csv", encoding='ISO-8859-1')
df.reset_index(level=0, inplace=True)
df
index | Review_ID | Rating | Year_Month | Reviewer_Location | Review_Text | Branch | |
---|---|---|---|---|---|---|---|
0 | 0 | 670772142 | 4 | 2019-4 | Australia | If you've ever been to Disneyland anywhere you... | Disneyland_HongKong |
1 | 1 | 670682799 | 4 | 2019-5 | Philippines | Its been a while since d last time we visit HK... | Disneyland_HongKong |
2 | 2 | 670623270 | 4 | 2019-4 | United Arab Emirates | Thanks God it wasn t too hot or too humid wh... | Disneyland_HongKong |
3 | 3 | 670607911 | 4 | 2019-4 | Australia | HK Disneyland is a great compact park. Unfortu... | Disneyland_HongKong |
4 | 4 | 670607296 | 4 | 2019-4 | United Kingdom | the location is not in the city, took around 1... | Disneyland_HongKong |
... | ... | ... | ... | ... | ... | ... | ... |
42651 | 42651 | 1765031 | 5 | missing | United Kingdom | i went to disneyland paris in july 03 and thou... | Disneyland_Paris |
42652 | 42652 | 1659553 | 5 | missing | Canada | 2 adults and 1 child of 11 visited Disneyland ... | Disneyland_Paris |
42653 | 42653 | 1645894 | 5 | missing | South Africa | My eleven year old daughter and myself went to... | Disneyland_Paris |
42654 | 42654 | 1618637 | 4 | missing | United States | This hotel, part of the Disneyland Paris compl... | Disneyland_Paris |
42655 | 42655 | 1536786 | 4 | missing | United Kingdom | I went to the Disneyparis resort, in 1996, wit... | Disneyland_Paris |
42656 rows × 7 columns
# Limit the data to Disneyland Paris
df = df[(df['Branch'] == 'Disneyland_Paris')]
# Drop rows if Year_Month is missing
df = df[df.Year_Month != 'missing']
df.reset_index(level=0, inplace=True)
df
level_0 | index | Review_ID | Rating | Year_Month | Reviewer_Location | Review_Text | Branch | |
---|---|---|---|---|---|---|---|---|
0 | 29026 | 29026 | 670721950 | 5 | 2019-3 | United Arab Emirates | We've been to Disneyland Hongkong and Tokyo, s... | Disneyland_Paris |
1 | 29027 | 29027 | 670686565 | 4 | 2018-6 | United Kingdom | I went to Disneyland Paris in April 2018 on Ea... | Disneyland_Paris |
2 | 29028 | 29028 | 670606796 | 5 | 2019-4 | United Kingdom | What a fantastic place, the queues were decent... | Disneyland_Paris |
3 | 29029 | 29029 | 670586937 | 4 | 2019-4 | Australia | We didn't realise it was school holidays when ... | Disneyland_Paris |
4 | 29031 | 29031 | 670400930 | 5 | 2019-4 | United Kingdom | Such a magical experience. I recommend making ... | Disneyland_Paris |
... | ... | ... | ... | ... | ... | ... | ... | ... |
12689 | 42113 | 42113 | 92198076 | 4 | 2011-1 | United Kingdom | Although our pick up was prompt the taxi drive... | Disneyland_Paris |
12690 | 42114 | 42114 | 92061774 | 4 | 2011-1 | Germany | Just returned from a 4 days family trip to Dis... | Disneyland_Paris |
12691 | 42115 | 42115 | 91995748 | 1 | 2010-12 | United Kingdom | We spent the 20 Dec 2010 in the Disney park an... | Disneyland_Paris |
12692 | 42116 | 42116 | 91984642 | 2 | 2010-12 | United Kingdom | Well I was really looking forward to this trip... | Disneyland_Paris |
12693 | 42117 | 42117 | 91827418 | 5 | 2010-9 | United Kingdom | If staying at a Disney hotel make good use of ... | Disneyland_Paris |
12694 rows × 8 columns
# Extract the year of the visit # \d{4} is a pattern that matches with four digit numbers (which is useful to extract years from text)
df['year'] = df['Year_Month'].str.extract('(\d{4})', expand=True)
# Convert this string to a datevariable
df['datetime'] = pd.to_datetime(df['year'], errors = 'coerce')
# Add a count (this will be useful later when making the graphs)
df['count'] = 1
df
# Keep the columns that we need
df = df[['Review_Text','datetime', 'count']]
df
Review_Text | datetime | count | |
---|---|---|---|
0 | We've been to Disneyland Hongkong and Tokyo, s... | 2019-01-01 | 1 |
1 | I went to Disneyland Paris in April 2018 on Ea... | 2018-01-01 | 1 |
2 | What a fantastic place, the queues were decent... | 2019-01-01 | 1 |
3 | We didn't realise it was school holidays when ... | 2019-01-01 | 1 |
4 | Such a magical experience. I recommend making ... | 2019-01-01 | 1 |
... | ... | ... | ... |
12689 | Although our pick up was prompt the taxi drive... | 2011-01-01 | 1 |
12690 | Just returned from a 4 days family trip to Dis... | 2011-01-01 | 1 |
12691 | We spent the 20 Dec 2010 in the Disney park an... | 2010-01-01 | 1 |
12692 | Well I was really looking forward to this trip... | 2010-01-01 | 1 |
12693 | If staying at a Disney hotel make good use of ... | 2010-01-01 | 1 |
12694 rows × 3 columns
You can off course stip these steps if you already have a dataframe with a text, datetime, and count variable column.
# Word counts for 'expensive' per year (you can resample by year, month, or day,i.e. 'A-DEC', 'M', or 'D')
df['term_of_interest'] = df['Review_Text'].str.count('expensive')
df_word = df.set_index('datetime').resample('A-DEC')['term_of_interest'].sum()
df_word = df_word.reset_index()
print(df_word.sum())
df_word
term_of_interest 2889 dtype: int64
datetime | term_of_interest | |
---|---|---|
0 | 2010-12-31 | 17 |
1 | 2011-12-31 | 207 |
2 | 2012-12-31 | 314 |
3 | 2013-12-31 | 489 |
4 | 2014-12-31 | 422 |
5 | 2015-12-31 | 438 |
6 | 2016-12-31 | 370 |
7 | 2017-12-31 | 301 |
8 | 2018-12-31 | 289 |
9 | 2019-12-31 | 42 |
2013 has most references to 'expensive,' yet we also know that the total number of reviews per year can differ quite a bit. We can address this issue by assesing the average count of a term per review.
# Get the total number of reviews per year
df_review = df.set_index('datetime').resample('A-DEC')['count'].sum()
df_review = df_review.reset_index()
print(df_review.sum())
df_review
count 12694 dtype: int64
datetime | count | |
---|---|---|
0 | 2010-12-31 | 40 |
1 | 2011-12-31 | 609 |
2 | 2012-12-31 | 1316 |
3 | 2013-12-31 | 1506 |
4 | 2014-12-31 | 1634 |
5 | 2015-12-31 | 2164 |
6 | 2016-12-31 | 1954 |
7 | 2017-12-31 | 1736 |
8 | 2018-12-31 | 1479 |
9 | 2019-12-31 | 256 |
# Merge the two dataframes
# This can be done in a more simple way, but the method below allows you to merge more than two dataframesdfs = [df_word, df_review]
from functools import reduce
dfs = [df_word, df_review]
df_merge = reduce(lambda left,right: pd.merge(left,right,on=['datetime'],
how='left'), dfs)
df_merge
datetime | term_of_interest | count | |
---|---|---|---|
0 | 2010-12-31 | 17 | 40 |
1 | 2011-12-31 | 207 | 609 |
2 | 2012-12-31 | 314 | 1316 |
3 | 2013-12-31 | 489 | 1506 |
4 | 2014-12-31 | 422 | 1634 |
5 | 2015-12-31 | 438 | 2164 |
6 | 2016-12-31 | 370 | 1954 |
7 | 2017-12-31 | 301 | 1736 |
8 | 2018-12-31 | 289 | 1479 |
9 | 2019-12-31 | 42 | 256 |
# Finally, we devide the variable 'term_of_interest' by 'count' (the total number of reviews)
df_merge['term/document'] = df_merge['term_of_interest']/df_merge['count']
df_merge
datetime | term_of_interest | count | term/document | |
---|---|---|---|---|
0 | 2010-12-31 | 17 | 40 | 0.425000 |
1 | 2011-12-31 | 207 | 609 | 0.339901 |
2 | 2012-12-31 | 314 | 1316 | 0.238602 |
3 | 2013-12-31 | 489 | 1506 | 0.324701 |
4 | 2014-12-31 | 422 | 1634 | 0.258262 |
5 | 2015-12-31 | 438 | 2164 | 0.202403 |
6 | 2016-12-31 | 370 | 1954 | 0.189355 |
7 | 2017-12-31 | 301 | 1736 | 0.173387 |
8 | 2018-12-31 | 289 | 1479 | 0.195402 |
9 | 2019-12-31 | 42 | 256 | 0.164062 |
This shows that the relative use of the term of interest decreases with time