Huge thanks to Alex Cookson who provided ALL of this week's data, cleaning script, and readme!
Check out his recent blog post on the same data.
This data comes from Playbill. Weekly box office grosses comprise data on revenue and attendance figures for theatres that are part of The Broadway League, an industry association for, you guessed it, Broadway theatre.
CPI data is from the U.S. Bureau of Labor Statistics. There are many, many measures of CPI, so the one used here is "All items less food and energy in U.S. city average, all urban consumers, seasonally adjusted" (table CUSR0000SA0L1E).
Check out all of the raw data and other details on Alex's GitHub
# Get the Data
grosses <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-04-28/grosses.csv', guess_max = 40000)
synopses <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-04-28/synopses.csv')
cpi <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-04-28/cpi.csv')
pre_1985_starts <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-04-28/pre-1985-starts.csv')
# Or read in with tidytuesdayR package (https://github.com/dslc-io/tidytuesdayR)
# PLEASE NOTE TO USE 2020 DATA YOU NEED TO USE the tidytuesdayR version after Jan 2020.
# Either ISO-8601 date or year/week works!
# Install via pak::pak("dslc-io/tidytuesdayR")
tuesdata <- tidytuesdayR::tt_load('2020-04-28')
tuesdata <- tidytuesdayR::tt_load(2020, week = 18)
grosses <- tuesdata$grosses
variable | class | description |
---|---|---|
week_ending | date | Date of the end of the weekly measurement period. Always a Sunday. |
week_number | double | Week number in the Broadway season. The season starts after the Tony Awards, held in early June. Some seasons have 53 weeks. |
weekly_gross_overall | double | Weekly box office gross for all shows |
show | character | Name of show. Some shows have the same name, but multiple runs. |
theatre | character | Name of theatre |
weekly_gross | double | Weekly box office gross for individual show |
potential_gross | double | Weekly box office gross if all seats are sold at full price. Shows can exceed their potential gross by selling premium tickets and/or standing room tickets. |
avg_ticket_price | double | Average price of tickets sold |
top_ticket_price | double | Highest price of tickets sold |
seats_sold | double | Total seats sold for all performances and previews |
seats_in_theatre | double | Theatre seat capacity |
pct_capacity | double | Percent of theatre capacity sold. Shows can exceed 100% capacity by selling standing room tickets. |
performances | double | Number of performances in the week |
previews | double | Number of preview performances in the week. Previews occur before a show's official open. |
variable | class | description |
---|---|---|
show | character | Name of show |
synopsis | character | Plot synopsis of show. Contains some missing values, especially for shows with multiple runs (due to how the data was collected). |
variable | class | description |
---|---|---|
year_month | date | Month of CPI value |
cpi | double | Consumer Price Index value for the given month |
variable | class | description |
---|---|---|
week_ending | date | Date of the end of the weekly measurement period |
show | character | Name of show |
run_start_week | date | Starting week for shows that premiered before 1985-06-08 (the start of the dataset) |
# Load packages
library(tidyverse)
library(lubridate)
library(rvest)
# Weekly grosses
### Create function to scrape grosses table
get_playbill_data = function(url) {
message(url)
website <- read_html(url)
show_stats <- list(
week_number = html_nodes(website, ".week-count .accent") %>% html_text(trim = TRUE),
weekly_gross_overall = html_nodes(website, ".week-total .accent") %>% html_text(trim = TRUE),
show = html_nodes(website, ".col-0 .data-value") %>% html_text(trim = TRUE),
theatre = html_nodes(website, ".col-0 .subtext") %>% html_text(trim = TRUE),
weekly_gross = html_nodes(website, ".col-1 .data-value") %>% html_text(trim = TRUE),
potential_gross = html_nodes(website, "td.col-1 .subtext") %>% html_text(trim = TRUE),
avg_ticket_price = html_nodes(website, ".col-3 .data-value") %>% html_text(trim = TRUE),
top_ticket_price = html_nodes(website, "td.col-3 .subtext") %>% html_text(trim = TRUE),
seats_sold = html_nodes(website, ".col-4 .data-value") %>% html_text(trim = TRUE),
seats_in_theatre = html_nodes(website, "td.col-4 .subtext") %>% html_text(trim = TRUE),
pct_capacity = html_nodes(website, ".col-6 .data-value") %>% html_text(trim = TRUE),
performances = html_nodes(website, ".col-5 .data-value") %>% html_text(trim = TRUE),
previews = html_nodes(website, "td.col-5 .subtext") %>% html_text(trim = TRUE)
)
tibble(show_stats = show_stats) %>%
mutate(variable_name = names(show_stats)) %>%
pivot_wider(names_from = variable_name, values_from = show_stats) %>%
unnest(cols = everything())
}
### Create tibble with list of URLs and scrape data
### TAKES A LONG TIME (~10 HOURS)
broadway_grosses_raw <-
tibble(week_ending = seq(ymd("1985-06-09"), ymd("2020-03-01"), by = "1 week")) %>%
mutate(grosses_url = paste0("https://www.playbill.com/grosses?week=", week_ending)) %>%
mutate(week_data = map(grosses_url, possibly(get_playbill_data, NULL, quiet = FALSE)))
### Clean grosses data
broadway_grosses <- broadway_grosses_raw %>%
unnest(week_data, keep_empty = TRUE) %>%
mutate_at(vars(week_number:weekly_gross_overall, weekly_gross:previews),
parse_number) %>%
mutate(
pct_capacity = pct_capacity / 100,
show = stringi::stri_trans_general(show, "Latin-ASCII")
) %>%
mutate_at(vars(potential_gross, top_ticket_price), ~ ifelse(. == 0, NA, .)) %>%
select(-grosses_url)
### Write to CSV
broadway_grosses %>%
write_csv("./broadway-grosses/grosses.csv")
# Show synopses
### Create function to scrape show synopses
get_synopsis <- function(url) {
message(url)
read_html(url) %>%
html_nodes(".spotlight-search-result .bsp-list-promo-desc") %>%
html_text(trim = TRUE)
}
synopses_raw <- broadway_grosses %>%
distinct(show) %>%
mutate(
synopsis_url = paste0(
"https://www.playbill.com/searchpage/search?q=",
urltools::url_encode(show),
"&qasset="
),
synopsis = map(
synopsis_url,
possibly(get_synopsis, NA_character_, quiet = FALSE)
)
)
# Clean synopsis data
synopses <- synopses_raw %>%
select(-synopsis_url) %>%
unnest(cols = c(synopsis), keep_empty = TRUE)
### Write to CSV
synopses %>%
write_csv("./broadway-grosses/synopses.csv")