# Twitter Data Analysis
# This R script performs retweet/quote/reply network analysis, structural topic modelling (STM), hashtag and text analysis, sentiment analysis, and time-series analysis on a Twitter dataset.
# Clean the environment
rm(list = ls())
# Set working directory
# Update this path to your working directory
# setwd("/path/to/your/directory")
# Load required libraries
# --- Data Handling and Wrangling ---
library(data.table)
library(dplyr)
library(tidyr)
library(readr)
library(purrr)
# --- Text Processing and Analysis ---
library(tidytext)
library(quanteda)
library(tm)
library(tokenizers)
library(stopwords)
library(stringr) # str_extract_all(), str_detect(), word() are used below
library(SentimentAnalysis)
library(DT)
# --- Statistical Modelling and Topic Modelling ---
library(lavaan)
library(stm)
library(topicmodels)
# --- Social Media APIs and Network Analysis ---
library(rtweet)
library(twitteR)
library(ROAuth)
library(graphTweets)
library(igraph)
library(tidygraph)
library(visNetwork)
library(quanteda.textplots)
library(vosonSML)
library(textnets) # GitHub package: cbail/textnets
library(gephi) # Non-CRAN package; provides gephi_write_edges()
# --- Data Visualisation ---
library(ggplot2)
library(ggthemes)
library(RColorBrewer)
library(gganimate)
library(plotly)
library(cowplot)
library(forcats)
library(graphlayouts)
library(ggraph)
library(formattable)
library(wordcloud)
library(networkD3)
# --- Miscellaneous Utilities ---
library(rvest)
library(syuzhet)
library(magrittr)
library(extrafont)
library(summarytools)
library(devtools)
library(stringi)
library(scales)
library(lubridate) # as_date() is used in the time-series section
library(htmlwidgets) # saveWidget() is used to export tables
library(webshot) # webshot() renders the exported HTML table to PNG
library(erer) # write.list() is used to save the friends lists
library(harrypotter) # Palette used in the emotions plot (GitHub: aljrico/harrypotter)
# --- How to log in (if not using twarc) ---
# Paste your keys between the quotation marks; copy them from the Twitter developer portal: https://developer.twitter.com/en/portal/dashboard
# Set up the connection with your Twitter developer app
reqURL <- "https://api.twitter.com/oauth/request_token"
accessURL <- "https://api.twitter.com/oauth/access_token"
authURL <- "https://api.twitter.com/oauth/authorize"
consumerKey <- ""    # Your API (consumer) key
consumerSecret <- "" # Your API (consumer) secret
twitCred <- OAuthFactory$new(consumerKey = consumerKey,
                             consumerSecret = consumerSecret,
                             requestURL = reqURL,
                             accessURL = accessURL,
                             authURL = authURL)
download.file(url = "https://curl.se/ca/cacert.pem", destfile = "cacert.pem")
twitCred$handshake(cainfo = "cacert.pem")
# Set up authorisation
options(httr_oauth_cache = TRUE)
api_key <- ""
api_secret <- ""
access_token <- ""
access_token_secret <- ""
setup_twitter_oauth(api_key, api_secret, access_token, access_token_secret)
# --- Set query from R (if not using a third-party collection interface) ---
q <- "#MeToo"
streamtime <- 60 * 60 * 24 # Stream for 24 hours
filename <- "tweets.json"
Mar31 <- stream_tweets(q = q, timeout = streamtime, file_name = filename, parse = FALSE)
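# Alternative (a sketch, if recent historical collection suits your design better
# than a live stream): rtweet's search endpoint retrieves matching tweets;
# retryonratelimit waits out the rate limit beyond the 18,000-tweet cap.
# searched_tweets <- search_tweets(q = q, n = 18000, include_rts = TRUE, retryonratelimit = TRUE)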
# --- Load data ---
# Replace these paths with your own datasets
tweets_data <- read.csv("your_dataset.csv")
# --- Cleaning data ---
# Change name of certain columns
colnames(tweets_data)[colnames(tweets_data) == "referenced_tweets.replied_to.id"] <- "referenced_id"
# Subset the data to include only tweets in a given language, e.g. German ("de")
german_tweets <- tweets_data %>%
  filter(lang == "de")
# Filter out unusable locations manually
tweets_data <- tweets_data %>%
  filter(author.location != "Here and there")
# Remove tweets from certain authors
tweets_data <- tweets_data %>%
  filter(!grepl('bitcoin', author.username)) # Use the 'text' column instead to remove tweets containing certain terms
# Standardise location names
tweets_data$author.location[tweets_data$author.location == "Italia"] <- "Italy"
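# Optional (sketch): drop exact duplicate tweets before analysis. This assumes
# your dataset has a tweet-ID column named "id" -- adjust to match your export.
# tweets_data <- tweets_data %>% distinct(id, .keep_all = TRUE)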
# --- Descriptive analysis ---
# Total original tweets (excluding retweets)
total_tweet_number <- tweets_data %>%
  filter(!is_retweet) %>%
  pull(status_id) %>%
  unique() %>%
  length()
# Total retweets
total_retweet_number <- tweets_data %>%
  filter(is_retweet) %>%
  pull(status_id) %>%
  unique() %>%
  length()
# Most retweeted tweets
tweets_data %>%
  arrange(-retweet_count) %>%
  slice(1:10) %>% # Adjust the number of tweets shown
  select(created_at, screen_name, text, retweet_count)
# Top author locations
tweets_data %>%
  count(author.location, sort = TRUE) %>%
  filter(author.location != "") %>%
  mutate(author.location = reorder(author.location, n)) %>%
  na.omit() %>%
  top_n(10) # Adjust value
# Unique Twitter accounts in the sample
length(unique(tweets_data$screen_name))
# Top tweeters
tweets_data %>%
  count(author.username, sort = TRUE) %>%
  top_n(10) %>% # Adjust value
  mutate(author.username = paste0("@", author.username))
# Tweets by language - create a pie chart with slice labels
plotdata <- tweets_data %>%
  filter(lang == "de" | lang == "pl" | lang == "it") %>%
  count(lang) %>%
  arrange(desc(lang)) %>%
  mutate(prop = round(n * 100 / sum(n), 1),
         lab.ypos = cumsum(prop) - 0.5 * prop)
plotdata$lang <- as.character(plotdata$lang)
plotdata$lang[plotdata$lang == "pl"] <- "Polish"
plotdata$lang[plotdata$lang == "it"] <- "Italian"
plotdata$lang[plotdata$lang == "de"] <- "German"
plotdata$label <- paste0(plotdata$lang, "\n", plotdata$n, "\n",
                         round(plotdata$prop), "%")
png("LanguageProportion.png", width = 11, height = 6, units = "in", res = 720)
ggplot(plotdata,
       aes(x = "", y = prop, fill = lang)) +
  geom_bar(width = 1, stat = "identity", color = NA) +
  geom_text(aes(y = lab.ypos, label = label),
            color = "white", family = "Arial Black") +
  coord_polar("y", start = 0) +
  scale_fill_manual(values = c("#5886a5", "#004c6d", "#9dc6e0"),
                    labels = c("German", "Italian", "Polish")) +
  theme_tufte() +
  theme(legend.position = "none",
        text = element_text(family = "mono", face = "bold", size = 12)) +
  labs(title = "Total tweets: 53,365") # Replace with your dataset's total
dev.off()
# --- Retweet/Quote/Reply Networks ---
# Add the original tweet author's name based on user ID
users_info <- lookup_users(as_userid(tweets_data$retweeted_user_id)) # Replace with quoted_user_id for quote networks and in_reply_to_user_id for reply networks
users_info$user_id <- as.double(users_info$user_id) # Convert the variable class in case of join errors
tweets_data <- tweets_data %>%
  left_join(
    select(users_info, retweeted_user_id = user_id, screen_name),
    by = "retweeted_user_id"
  )
# Create retweet network graph (edges run from retweeter to retweeted author)
retweet_graph <- tweets_data %>%
  filter(public_metrics.retweet_count > 0) %>%
  select(author.username, screen_name) %>%
  filter(!is.na(screen_name)) %>%
  graph_from_data_frame()
# Summarise the retweet graph
summary(retweet_graph)
# Export the graph for Gephi
write_graph(simplify(retweet_graph), "retweet_network.gml", format = "gml")
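# Quick sanity check (sketch): the accounts most retweeted within the network
# are those with the highest in-degree.
sort(degree(retweet_graph, mode = "in"), decreasing = TRUE)[1:10]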
# --- Friends networks of top users: mutual follows among the 100 most retweeted users ---
# Create a subset of the 100 most retweeted users
topusers <- tweets_data %>%
  arrange(-retweet_count) %>%
  distinct(author.username, .keep_all = TRUE) %>%
  slice(1:100)
topusers %>% select(author.username) %>% unique() %>% arrange(author.username) %>% DT::datatable()
users_info <- lookup_users(topusers$author.username)
users_info %>% arrange(-followers_count) %>% head()
users_info %>% select(screen_name) %>% unique() %>% arrange(screen_name) %>% DT::datatable()
# Calculate percentile-based scores for various metrics (followers, friends, etc.)
users_ranking <- users_info %>%
  filter(protected == FALSE) %>%
  select(screen_name, followers_count, friends_count, listed_count, favourites_count, statuses_count) %>%
  mutate(
    followers_percentile = ecdf(followers_count)(followers_count),
    friends_percentile = ecdf(friends_count)(friends_count),
    listed_percentile = ecdf(listed_count)(listed_count),
    favourites_percentile = ecdf(favourites_count)(favourites_count),
    statuses_percentile = ecdf(statuses_count)(statuses_count)
  ) %>%
  group_by(screen_name) %>%
  summarise(top_score = sum(followers_percentile, friends_percentile, listed_percentile, favourites_percentile, statuses_percentile)) %>%
  ungroup() %>%
  mutate(ranking = rank(-top_score))
# Select the top 30 users based on their calculated scores
top_30 <- users_ranking %>% arrange(desc(top_score)) %>% head(30)
top_30_users_ranking <- top_30 %>% select(screen_name) %>% DT::datatable()
# Save the ranking of top 30 users to an HTML and PNG file
saveWidget(top_30_users_ranking, "users_ranking.html")
webshot("users_ranking.html", "top_30_users_ranking.png")
# Get the friends list for each top user in batches to avoid hitting rate limits
usernames <- users_info$screen_name
# Extract friends in batches of 15 users at a time
extract_friends_batch <- function(start_index, end_index) {
  Sys.sleep(15 * 60) # Wait 15 minutes so each batch starts with a fresh rate-limit window
  friends_batch <- purrr::map(usernames[start_index:end_index], get_friends)
  names(friends_batch) <- usernames[start_index:end_index]
  return(friends_batch)
}
# Extract friends for all 100 users in batches
friends_top100 <- c(
  extract_friends_batch(1, 15),
  extract_friends_batch(16, 30),
  extract_friends_batch(31, 45),
  extract_friends_batch(46, 60),
  extract_friends_batch(61, 75),
  extract_friends_batch(76, 90),
  extract_friends_batch(91, 100)
)
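# A more compact alternative (sketch): derive the batch boundaries
# programmatically instead of hard-coding each call.
# chunks <- split(seq_along(usernames), ceiling(seq_along(usernames) / 15))
# friends_top100 <- unlist(lapply(chunks, function(idx) extract_friends_batch(min(idx), max(idx))), recursive = FALSE)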
# Save the friends lists to a CSV file (write.list() is from the erer package)
write.list(z = friends_top100, file = "friends_top100.csv", t.name = NULL, row.names = FALSE)
# Merge the friends list with the top 100 user data
friends_top100_df <- map2_df(friends_top100, names(friends_top100), ~ mutate(.x, twitter_top_user = .y)) %>%
  rename(friend_id = user_id) %>%
  filter(friend_id %in% users_info$user_id) %>%
  mutate(friend_name = users_info$screen_name[match(friend_id, users_info$user_id)])
# Convert to an igraph object
network <- graph_from_data_frame(friends_top100_df, directed = TRUE)
# Add "Popularity" (degree centrality) as a node attribute
V(network)$Popularity <- degree(network, mode = 'in')
# Create a tidygraph object for easier plotting
tg <- as_tbl_graph(network) %>%
  activate(nodes) %>%
  mutate(label = name)
# Calculate eigenvector centrality to gauge influence
eigenCent <- eigen_centrality(network)$vector # evcent() is deprecated in recent versions of igraph
colorVals <- rev(heat.colors(30))[cut(eigenCent, 30, labels = FALSE, include.lowest = TRUE)]
# Plot the network
png("Top100_MutualFriendsNetwork.png", width = 18, height = 12, units = "in", res = 720)
tg %>%
  ggraph(layout = "stress") +
  geom_edge_diagonal(alpha = .2, color = 'white') +
  geom_node_point(size = log(V(network)$Popularity + 1) * 2, color = colorVals) + # +1 avoids log(0) for nodes with no followers in the set
  geom_node_text(aes(label = label, size = log(V(network)$Popularity + 1)), repel = TRUE) +
  labs(
    title = "Influential Mutual Friends: Do the Most Retweeted Users Follow Each Other?",
    subtitle = "Popularity (node size) and influence (node colour) among the 100 most retweeted users in the dataset."
  ) +
  theme_graph() +
  theme(plot.title = element_text(hjust = 0.5), plot.subtitle = element_text(hjust = 0.5))
dev.off()
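# Optional (sketch): community detection on the mutual-friends network.
# cluster_walktrap() accepts the directed graph (edge directions are ignored).
communities <- cluster_walktrap(network)
table(membership(communities)) # Community sizes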
# --- User-hashtag networks ---
# Function to create user-hashtag network: users connected by shared hashtag usage
create_user_hashtag_network <- function(data, network_name, min_hashtag_freq = 5, min_user_hashtags = 3, min_edge_weight = 2) {
  # Expects a `User.Name` column (the posting user) and a `text` column
  # Extract hashtags for each user
  user_hashtags <- data %>%
    mutate(hashtags = str_extract_all(text, "#\\w+")) %>%
    unnest(hashtags) %>%
    select(User.Name, hashtags) %>%
    distinct() # Count each user-hashtag pair once, even if the user repeats the hashtag
  # Keep only hashtags above the frequency threshold
  frequent_hashtags <- user_hashtags %>%
    count(hashtags) %>%
    filter(n >= min_hashtag_freq) %>%
    pull(hashtags)
  user_hashtags <- user_hashtags %>%
    filter(hashtags %in% frequent_hashtags)
  # Keep only users who used at least min_user_hashtags distinct hashtags
  active_users <- user_hashtags %>%
    group_by(User.Name) %>%
    summarise(hashtag_count = n_distinct(hashtags), .groups = "drop") %>%
    filter(hashtag_count >= min_user_hashtags) %>%
    pull(User.Name)
  user_hashtags <- user_hashtags %>%
    filter(User.Name %in% active_users)
  # Create edges between users who share hashtags
  edges <- user_hashtags %>%
    inner_join(user_hashtags, by = "hashtags") %>%
    filter(User.Name.x != User.Name.y) %>% # Avoid self-loops
    group_by(User.Name.x, User.Name.y) %>%
    summarise(weight = n(), .groups = "drop") %>% # Count shared hashtags
    filter(weight >= min_edge_weight) %>% # Only keep strong edges
    distinct()
  # Ensure each pair is unique and undirected
  edges <- edges %>%
    mutate(User1 = pmin(User.Name.x, User.Name.y),
           User2 = pmax(User.Name.x, User.Name.y)) %>%
    select(User1, User2, weight) %>%
    distinct()
  # Create a graph from the edges
  graph <- graph_from_data_frame(edges, directed = FALSE)
  # Node attribute: total hashtags used by each user
  nodes <- user_hashtags %>%
    group_by(User.Name) %>%
    summarise(hashtag_count = n(), .groups = "drop") %>%
    rename(Id = User.Name)
  # Export edges and nodes to CSV for Gephi
  write_csv(edges, paste0(network_name, "_edges.csv"))
  write_csv(nodes, paste0(network_name, "_nodes.csv"))
  return(graph)
}
# Example usage (rename the author column to match the function's expected input)
user_hashtag_network <- create_user_hashtag_network(
  tweets_data %>% rename(User.Name = author.username),
  "user_hashtag_network",
  min_hashtag_freq = 5,
  min_user_hashtags = 3,
  min_edge_weight = 3
)
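# Quick inspection of the resulting graph (sketch):
summary(user_hashtag_network)
components(user_hashtag_network)$no # Number of connected components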
# --- User co-occurrence network ---
# Note: tweet_dfm is built in the STM section below; run that tokenisation step first
# Extract the most frequently mentioned usernames
user_dfm <- dfm_select(tweet_dfm, pattern = "@*")
user_top <- names(topfeatures(user_dfm, 100))
head(user_top)
# Construct a feature co-occurrence matrix of usernames
user_fcm <- fcm(user_dfm)
user_top_fcm <- fcm_select(user_fcm, pattern = user_top)
head(user_top_fcm)
# Plot
jpeg("Top100UserCooccurrence.jpeg", width = 18, height = 12, units = "in", res = 720)
textplot_network(user_top_fcm, min_freq = 0.4, edge_color = "mediumpurple3", edge_alpha = 0.8, edge_size = 3)
dev.off()
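# The same approach works for hashtag co-occurrence (a sketch following the
# same quanteda pattern as the username network above; also requires tweet_dfm
# from the STM section below):
tag_dfm <- dfm_select(tweet_dfm, pattern = "#*")
tag_top <- names(topfeatures(tag_dfm, 50))
tag_fcm <- fcm_select(fcm(tag_dfm), pattern = tag_top)
textplot_network(tag_fcm, min_freq = 0.1, edge_alpha = 0.8, edge_size = 3)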
# --- Two-mode networks ---
# Prepare a tweet corpus grouped by author for a discourse network: nodes are authors, with edges weighted by the overlap of words used in their tweets
twomode_data <- tweets_data %>% select(text, author.username) %>% group_by(author.username)
twomode_data <- textnets::PrepText(twomode_data, groupvar = "author.username", textvar = "text", node_type = "groups", tokenizer = "tweets", strip_url = TRUE, pos = "nouns", remove_stop_words = TRUE, compound_nouns = FALSE)
twomode_network <- CreateTextnet(twomode_data)
graph_centrality <- TextCentrality(twomode_network) # Analyse centrality
graph_communities <- TextCommunities(twomode_network) # Louvain community detection
VisTextNet(twomode_network, label_degree_cut = 0) # Visualise the network
gephi_write_edges(twomode_network, "edges.csv") # Or export for Gephi
# --- Structural Topic Modelling (STM) ---
# Convert the tweet texts into a corpus and document-feature matrix
tweet_corpus <- corpus(tweets_data$text)
tweet_dfm <- tokens(tweet_corpus, remove_numbers = TRUE, remove_punct = TRUE, remove_url = TRUE)
tweet_dfm <- tokens_remove(tweet_dfm, stopwords("english")) # Replace with the language of your corpus
tweet_dfm <- dfm(tweet_dfm)
tweet_dfm <- dfm_trim(tweet_dfm, max_docfreq = 0.50, min_docfreq = 0.01, docfreq_type = 'prop') # Adjust frequencies
# Convert to STM-compatible format (documents that end up empty after trimming are dropped)
stm_data <- quanteda::convert(tweet_dfm, to = "stm", docvars = tweets_data)
# Run STM model with 20 topics
stm_model <- stm(
  documents = stm_data$documents,
  vocab = stm_data$vocab,
  data = stm_data$meta,
  init.type = 'Spectral',
  prevalence = ~author.public_metrics.followers_count,
  K = 20,
  verbose = FALSE
)
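# The choice of K = 20 is one option; stm's searchK() can compare candidate
# topic numbers on held-out likelihood, coherence, and exclusivity (a sketch;
# commented out because it can be slow on large corpora):
# k_search <- searchK(stm_data$documents, stm_data$vocab, K = c(10, 15, 20, 25),
#                     data = stm_data$meta, init.type = "Spectral")
# plot(k_search)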
# Save the STM topic model summary plot
png("Top_Topic_Model.png", width = 16, height = 10, units = "in", res = 300)
plot(stm_model, n = 10)
dev.off()
# Correlation between STM topics
topic_cor <- topicCorr(stm_model, verbose = TRUE) # Renamed to avoid masking base R's cor()
png("CorrelationBetweenTopics.png", width = 12, height = 10, units = "in", res = 300)
plot(topic_cor)
dev.off()
# findThoughts - retrieve example tweets carrying these topics
# Note: make sure the texts vector lines up with stm_data$documents if any documents were dropped during conversion
thoughts <- findThoughts(stm_model, texts = tweets_data$text, n = 5, topics = 10) # Adjust n for the number of tweets and topics for the topic(s) to pull from
png("ExampleTweets.png", width = 10, height = 12, units = "in", res = 300)
plot(thoughts)
dev.off()
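# Inspect the topics further (sketch): labelTopics() lists the top words per
# topic, and estimateEffect() estimates how the prevalence covariate used
# above relates to topic proportions.
labelTopics(stm_model, topics = 1:20, n = 7)
stm_effects <- estimateEffect(1:20 ~ author.public_metrics.followers_count, stm_model, metadata = stm_data$meta)
summary(stm_effects, topics = 1) # Adjust the topic of interest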
# --- Sentiment analysis ---
# Calculate sentiment scores with several lexicons
# Note: the nrc method supports other languages via `language`; the syuzhet, bing and afinn lexicons are English-only
tweet_texts <- as.character(tweets_data$text)
sentiments_df <- data.frame(
  timestamp = as.POSIXct(tweets_data$created_at, tz = "GMT"),
  syuzhet = get_sentiment(tweet_texts, method = "syuzhet"),
  bing = get_sentiment(tweet_texts, method = "bing"),
  afinn = get_sentiment(tweet_texts, method = "afinn"),
  nrc = get_sentiment(tweet_texts, method = "nrc", language = "german") # Change to the language of your corpus
)
plot_ly(sentiments_df, x = ~timestamp, y = ~syuzhet, type = "scatter", mode = "lines", name = "syuzhet") %>%
  add_trace(y = ~bing, mode = "lines", name = "bing") %>%
  add_trace(y = ~afinn, mode = "lines", name = "afinn") %>%
  add_trace(y = ~nrc, mode = "lines", name = "nrc") %>%
  layout(title = "Sentiment scores",
         yaxis = list(title = "score"), xaxis = list(title = "date"))
# Distribution of emotions in tweets
emotions <- get_nrc_sentiment(as.character(tweets_data$text), language = "german") # Change to the language of your corpus
emo_bar <- colSums(emotions)
emo_sum <- data.frame(count = emo_bar, emotion = names(emo_bar))
emo_sum$emotion <- factor(emo_sum$emotion, levels = emo_sum$emotion[order(emo_sum$count, decreasing = TRUE)])
colnames(emo_sum) <- c("WordCount", "Emotions")
png("Emotions.png", width = 25, height = 13, units = "in", res = 720) # Visualise
emo_sum %>%
  ggplot(aes(x = reorder(Emotions, WordCount), y = WordCount, fill = Emotions)) +
  geom_col() +
  scale_fill_manual(values = harrypotter(n = 10, alpha = .5, option = "Gryffindor", direction = -1), name = "") +
  labs(x = "Emotions", y = "Word Count",
       caption = "Data collected from Twitter API for Academic Research",
       title = "Distribution of emotions in tweets") +
  coord_flip() +
  scale_y_continuous(expand = c(0, 0)) +
  theme(plot.title = element_text(family = "Gill Sans", size = 28, face = "bold", hjust = 0.1, vjust = 1.2),
        axis.text = element_text(family = "Arial", size = 18),
        axis.title.x = element_text(family = "Helvetica", size = 18, face = "bold", vjust = -1),
        axis.title.y = element_text(family = "Helvetica", size = 18, face = "bold"),
        plot.caption = element_text(size = 12, face = "italic", hjust = 1.1),
        legend.title = element_text(family = "Helvetica", size = 18, face = "bold"),
        legend.text = element_text(family = "Arial", size = 18),
        panel.background = element_blank())
dev.off()
# --- Textual Analysis ---
# Most retweeted tweets
most_retweeted <- tweets_data %>%
  arrange(-public_metrics.retweet_count) %>% # The same works for like and quote counts
  slice(1:20) %>%
  select(text, public_metrics.retweet_count, created_at, author.username, author.description)
write.csv(most_retweeted, "most_retweeted.csv", row.names = FALSE)
# Hashtag analysis
extract.hashes <- function(vec) {
  hash.pattern <- "#[[:alnum:]_]+" # Include digits and underscores, not just letters
  have.hash <- grep(x = vec, pattern = hash.pattern)
  hash.matches <- gregexpr(pattern = hash.pattern, text = vec[have.hash])
  extracted.hash <- regmatches(x = vec[have.hash], m = hash.matches)
  df <- data.frame(table(tolower(unlist(extracted.hash))))
  colnames(df) <- c("hashtag", "freq")
  df <- df[order(df$freq, decreasing = TRUE), ]
  return(df)
}
# Top 50 hashtags from the dataset
hashtags <- head(extract.hashes(tweets_data$text), 50) # Adjust to select the desired number of top hashtags
write.csv(hashtags, "top_hashtags.csv", row.names = FALSE)
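# Optional (sketch): visualise the top hashtags as a word cloud
wordcloud(words = hashtags$hashtag, freq = hashtags$freq, max.words = 50,
          colors = brewer.pal(8, "Dark2"))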
# Alternative method to count top hashtags
tweets_data %>%
  unnest_tokens(hashtag, text, "tweets", to_lower = FALSE) %>%
  filter(str_detect(hashtag, "^#")) %>%
  count(hashtag, sort = TRUE) %>%
  top_n(50) # Adjust value
# Flag tweets that start with @username (to capture replies) and record the first word
starts_with_mention <- function(text) {
  # Allow any whitespace or punctuation before @username
  str_detect(text, "^[\\s.,;:()-]*@[A-Za-z0-9_]+")
}
tweets_data <- tweets_data %>%
  mutate(
    starts_with_at = starts_with_mention(text),
    first_word = word(str_trim(text), 1)
  )
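# Example use of the flag (sketch): the share of tweets opening with a mention,
# a rough proxy for the proportion of replies in the sample.
mean(tweets_data$starts_with_at, na.rm = TRUE)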
# N-grams analysis
tweet_bigrams_data <- data.frame(texts = as.character(tweets_data$text), stringsAsFactors = FALSE)
tweet_bigrams_data$texts <- removeWords(tweet_bigrams_data$texts, stopwords("english")) # Keep the `texts` column name so unnest_tokens() can find it
# Unigram analysis
unigrams <- tweet_bigrams_data %>%
  unnest_tokens(output = unigram, input = texts, token = "ngrams", n = 1)
unigram_count <- unigrams %>%
  group_by(unigram) %>%
  summarise(n = n()) %>%
  mutate(freq = formattable::percent(n / sum(n))) %>%
  top_n(100) %>%
  arrange(desc(n))
write.csv(unigram_count, "unigrams.csv", row.names = FALSE)
# Bigram and trigram analysis can be performed similarly; a bigram sketch follows.
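bigram_count <- tweet_bigrams_data %>%
  unnest_tokens(output = bigram, input = texts, token = "ngrams", n = 2) %>%
  filter(!is.na(bigram)) %>% # Documents with a single token yield NA
  count(bigram, sort = TRUE) %>%
  mutate(freq = formattable::percent(n / sum(n))) %>%
  top_n(100)
write.csv(bigram_count, "bigrams.csv", row.names = FALSE)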
# --- Time Series Analysis ---
tweets_data$date <- as.POSIXct(tweets_data$created_at, tz = "GMT", format = "%Y-%m-%d")
png("timeseries.png", width = 7, height = 8, units = "in", res = 720)
tweets_data %>%
  mutate(Date = as_date(date)) %>%
  group_by(Date) %>%
  summarise(count = n_distinct(text)) %>%
  ggplot(aes(x = Date, y = count)) +
  geom_line() +
  labs(title = "Number of Tweets Over Time", x = "Date", y = "Number of Tweets") +
  theme_minimal() +
  theme(panel.background = element_rect(fill = "white"),
        panel.grid.major.x = element_line(linewidth = 0.25, linetype = 'dashed', colour = "grey"),
        plot.caption = element_text(size = 7.5, face = "italic", color = "darkgrey"))
dev.off()
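# Variant (sketch): compare original tweets and retweets over time, reusing
# the is_retweet flag from the descriptive section.
tweets_data %>%
  mutate(Date = as_date(date), type = ifelse(is_retweet, "Retweet", "Original")) %>%
  count(Date, type) %>%
  ggplot(aes(x = Date, y = n, colour = type)) +
  geom_line() +
  theme_minimal()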
# --- End of Script ---
# This script can be used to replicate the analysis on various datasets by replacing the data source.
# Ensure that file paths, data formats, and variables are adjusted according to the specific dataset being used.