# Read the .CSV file as a dataframe 
import os
corpus_path = 'C:/Users/User/Downloads/DisneylandReviews.csv'  # Change this path to the preferred/relevant location on your computer
os.chdir(corpus_path)

import warnings
warnings.filterwarnings('ignore') # only use this when you know the script and want to supress unnecessary warnings

import pandas as pd
df = pd.read_csv("DisneylandReviews.csv", encoding='ISO-8859-1')
df.reset_index(level=0, inplace=True)
df


# Limit the data to Disneyland Paris
df = df[(df['Branch'] == 'Disneyland_Paris')]
# Drop rows if Year_Month is missing
df = df[df.Year_Month != 'missing']
df.reset_index(level=0, inplace=True)
df


# Extract the year of the visit # \d{4} is a pattern that matches with four digit numbers (which is useful to extract years from text)
df['year'] = df['Year_Month'].str.extract('(\d{4})', expand=True)

# Convert this string to a datevariable
df['datetime']  = pd.to_datetime(df['year'], errors = 'coerce')

# Add a count (this will be useful later when making the graphs)
df['count'] = 1
df

# Keep the columns that we need
df = df[['Review_Text','datetime', 'count']]
df


# Word counts for 'expensive' per year (you can resample by year, month, or day,i.e. 'A-DEC', 'M', or 'D') 
df['term_of_interest'] = df['Review_Text'].str.count('expensive')
df_word = df.set_index('datetime').resample('A-DEC')['term_of_interest'].sum()
df_word = df_word.reset_index()
print(df_word.sum())
df_word

term_of_interest    2889
dtype: int64


# Get the total number of reviews per year 
df_review = df.set_index('datetime').resample('A-DEC')['count'].sum()
df_review = df_review.reset_index()
print(df_review.sum())
df_review

count    12694
dtype: int64


# Merge the two dataframes 
# This can be done in a more simple way, but the method below allows you to merge more than two dataframesdfs = [df_word, df_review]
from functools import reduce
dfs = [df_word, df_review]
df_merge = reduce(lambda  left,right: pd.merge(left,right,on=['datetime'],
                                            how='left'), dfs)
df_merge


# Finally, we devide the variable 'term_of_interest' by 'count' (the total number of reviews)
df_merge['term/document'] = df_merge['term_of_interest']/df_merge['count']
df_merge

	index	Review_ID	Rating	Year_Month	Reviewer_Location	Review_Text	Branch
0	0	670772142	4	2019-4	Australia	If you've ever been to Disneyland anywhere you...	Disneyland_HongKong
1	1	670682799	4	2019-5	Philippines	Its been a while since d last time we visit HK...	Disneyland_HongKong
2	2	670623270	4	2019-4	United Arab Emirates	Thanks God it wasn t too hot or too humid wh...	Disneyland_HongKong
3	3	670607911	4	2019-4	Australia	HK Disneyland is a great compact park. Unfortu...	Disneyland_HongKong
4	4	670607296	4	2019-4	United Kingdom	the location is not in the city, took around 1...	Disneyland_HongKong
...	...	...	...	...	...	...	...
42651	42651	1765031	5	missing	United Kingdom	i went to disneyland paris in july 03 and thou...	Disneyland_Paris
42652	42652	1659553	5	missing	Canada	2 adults and 1 child of 11 visited Disneyland ...	Disneyland_Paris
42653	42653	1645894	5	missing	South Africa	My eleven year old daughter and myself went to...	Disneyland_Paris
42654	42654	1618637	4	missing	United States	This hotel, part of the Disneyland Paris compl...	Disneyland_Paris
42655	42655	1536786	4	missing	United Kingdom	I went to the Disneyparis resort, in 1996, wit...	Disneyland_Paris

	level_0	index	Review_ID	Rating	Year_Month	Reviewer_Location	Review_Text	Branch
0	29026	29026	670721950	5	2019-3	United Arab Emirates	We've been to Disneyland Hongkong and Tokyo, s...	Disneyland_Paris
1	29027	29027	670686565	4	2018-6	United Kingdom	I went to Disneyland Paris in April 2018 on Ea...	Disneyland_Paris
2	29028	29028	670606796	5	2019-4	United Kingdom	What a fantastic place, the queues were decent...	Disneyland_Paris
3	29029	29029	670586937	4	2019-4	Australia	We didn't realise it was school holidays when ...	Disneyland_Paris
4	29031	29031	670400930	5	2019-4	United Kingdom	Such a magical experience. I recommend making ...	Disneyland_Paris
...	...	...	...	...	...	...	...	...
12689	42113	42113	92198076	4	2011-1	United Kingdom	Although our pick up was prompt the taxi drive...	Disneyland_Paris
12690	42114	42114	92061774	4	2011-1	Germany	Just returned from a 4 days family trip to Dis...	Disneyland_Paris
12691	42115	42115	91995748	1	2010-12	United Kingdom	We spent the 20 Dec 2010 in the Disney park an...	Disneyland_Paris
12692	42116	42116	91984642	2	2010-12	United Kingdom	Well I was really looking forward to this trip...	Disneyland_Paris
12693	42117	42117	91827418	5	2010-9	United Kingdom	If staying at a Disney hotel make good use of ...	Disneyland_Paris

	Review_Text	datetime	count
0	We've been to Disneyland Hongkong and Tokyo, s...	2019-01-01	1
1	I went to Disneyland Paris in April 2018 on Ea...	2018-01-01	1
2	What a fantastic place, the queues were decent...	2019-01-01	1
3	We didn't realise it was school holidays when ...	2019-01-01	1
4	Such a magical experience. I recommend making ...	2019-01-01	1
...	...	...	...
12689	Although our pick up was prompt the taxi drive...	2011-01-01	1
12690	Just returned from a 4 days family trip to Dis...	2011-01-01	1
12691	We spent the 20 Dec 2010 in the Disney park an...	2010-01-01	1
12692	Well I was really looking forward to this trip...	2010-01-01	1
12693	If staying at a Disney hotel make good use of ...	2010-01-01	1

	datetime	term_of_interest
0	2010-12-31	17
1	2011-12-31	207
2	2012-12-31	314
3	2013-12-31	489
4	2014-12-31	422
5	2015-12-31	438
6	2016-12-31	370
7	2017-12-31	301
8	2018-12-31	289
9	2019-12-31	42

	datetime	count
0	2010-12-31	40
1	2011-12-31	609
2	2012-12-31	1316
3	2013-12-31	1506
4	2014-12-31	1634
5	2015-12-31	2164
6	2016-12-31	1954
7	2017-12-31	1736
8	2018-12-31	1479
9	2019-12-31	256

1. Open and prepare the dataset¶

2. Getting the words counts¶

	datetime	term_of_interest	count	term/document
0	2010-12-31	17	40	0.425000
1	2011-12-31	207	609	0.339901
2	2012-12-31	314	1316	0.238602
3	2013-12-31	489	1506	0.324701
4	2014-12-31	422	1634	0.258262
5	2015-12-31	438	2164	0.202403
6	2016-12-31	370	1954	0.189355
7	2017-12-31	301	1736	0.173387
8	2018-12-31	289	1479	0.195402
9	2019-12-31	42	256	0.164062