Skip to content

Latest commit

 

History

History
208 lines (167 loc) · 7.4 KB

readme.md

File metadata and controls

208 lines (167 loc) · 7.4 KB

Beyoncé and Taylor Swift Lyrics

The data this week comes from Rosie Baillie and Dr. Sara Stoudt.

Beyoncé's top 100 - Billboard. Taylor Swift's top 100 - Billboard.

Rosie put together a wonderful analysis of Taylor Swift lyrics! Can you do a similar analysis of Beyoncé's lyrics?

Text analysis guides in tidytext or Supervised Machine Learning for Text Analysis in R.

The beyonce palettes R pkg.

Get the data here

# Get the Data

# Option 1: read in with the tidytuesdayR package
# Install from CRAN via: install.packages("tidytuesdayR")
# This loads the readme and all the datasets for the week of interest

# Either an ISO-8601 date or a year/week pair identifies the week.
# Use ONE of the two calls; running both only downloads the same data twice.

tuesdata <- tidytuesdayR::tt_load("2020-09-29")
# tuesdata <- tidytuesdayR::tt_load(2020, week = 40)

# Pull each dataset out of the loaded object so that both options leave the
# same four objects in the workspace.
beyonce_lyrics <- tuesdata$beyonce_lyrics
taylor_swift_lyrics <- tuesdata$taylor_swift_lyrics
sales <- tuesdata$sales
charts <- tuesdata$charts

# Option 2: read in the data manually

beyonce_lyrics <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-29/beyonce_lyrics.csv")
taylor_swift_lyrics <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-29/taylor_swift_lyrics.csv")
sales <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-29/sales.csv")
charts <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-29/charts.csv")

Data Dictionary

beyonce_lyrics.csv

variable class description
line character Lyric line
song_id double Song ID
song_name character Song Name
artist_id double Artist ID
artist_name character Artist Name
song_line double Song line number

taylor_swift_lyrics.csv

variable class description
Artist character Artist
Album character Album name
Title character Title of song
Lyrics character Lyrics

sales.csv

variable class description
artist character Artist name
title character Song title
country character Country for sales
sales double Sales in dollars
released character Release date
re_release character Re-released date
label character Label released under
formats character Formats released as

charts.csv

variable class description
artist character Artist name
title character Song title
released character Release date
re_release character Re-released date
label character Label released under
formats character Formats released as
chart character Country Chart
chart_position character Highest Chart position

Cleaning Script

library(tidyverse)
library(rvest)

# Taylor Swift ------------------------------------------------------------

# Wikipedia page whose album table is scraped below.
ts_url <- "https://en.wikipedia.org/wiki/Taylor_Swift_discography"

raw_ts_html <- ts_url %>%
  read_html()

# ts_raw: one row per album, with the multi-line "album details" cell spread
# into one column per detail type. `sales` is kept as a list-column of
# "<country>: <amount>" strings for ts_sales to unnest.
ts_raw <- raw_ts_html %>%
  # NOTE(review): this positional selector depends on the live page layout;
  # confirm it still points at the intended table before re-running.
  html_node("#mw-content-text > div.mw-parser-output > table:nth-child(10)") %>%
  html_table(fill = TRUE) %>%
  data.frame() %>%
  janitor::clean_names() %>%
  tibble() %>%
  # Drop the first and the last row of the scraped table
  # (presumably a repeated header row and a footnote row -- confirm).
  slice(-1, -nrow(.)) %>%
  # Each cell holds several newline-separated entries; split into list-columns.
  mutate(
    album_details = str_split(album_details, "\n"),
    sales = str_split(sales, "\n")
  ) %>%
  select(-certifications) %>%
  unnest_longer(album_details) %>%
  # "Released: <date>" -> album_detail_type = "Released", album_details = "<date>"
  separate(album_details, into = c("album_detail_type", "album_details"), sep = ": ") %>%
  # Treat "Re-edition" as "Re-release" so both end up in the same column.
  mutate(album_detail_type = if_else(album_detail_type == "Re-edition", "Re-release", album_detail_type)) %>%
  pivot_wider(names_from = album_detail_type, values_from = album_details) %>%
  # Drop the placeholder column produced by entries without a detail type.
  # any_of() keeps the pipeline from failing when no such column exists.
  select(-any_of(c("na", "NA"))) %>%
  janitor::clean_names()

# ts_sales: Taylor Swift sales in long form. Each entry of the `sales`
# list-column becomes its own row, split into `country` and a numeric `sales`.
ts_sales <- ts_raw %>%
  unnest_longer(sales) %>%
  separate(sales, into = c("country", "sales"), sep = ": ") %>%
  # Trim surrounding whitespace, then keep only the numeric part.
  mutate(sales = parse_number(str_trim(sales))) %>%
  mutate(artist = "Taylor Swift") %>%
  # Final column order: artist first, then title, country, sales and details.
  select(artist, title, country, sales, released:formats)


# ts_chart: Taylor Swift peak chart positions in long form, one row per
# album and chart.
ts_chart <- ts_raw %>%
  select(title, released:formats, contains("peak_chart")) %>%
  pivot_longer(
    cols = contains("peak_chart"),
    names_to = "chart",
    values_to = "chart_position"
  ) %>%
  mutate(
    # Reduce the column name to its numeric suffix ("", "_1", ..., "_9") ...
    chart = str_remove(chart, "peak_chart_positions"),
    # ... and translate the suffix into a chart label by position; any other
    # suffix becomes NA.
    # NOTE(review): the suffix-to-chart order is hard-coded -- confirm it
    # matches the column order of the scraped table.
    chart = c("US", "AUS", "CAN", "FRA", "GER", "IRE", "JPN", "NZ", "SWE", "UK")[
      match(chart, c("", "_1", "_2", "_3", "_4", "_5", "_6", "_7", "_8", "_9"))
    ]
  ) %>%
  mutate(artist = "Taylor Swift", .before = title)


# Beyonce -----------------------------------------------------------------


# Wikipedia page whose album table is scraped below.
bey_url <- "https://en.wikipedia.org/wiki/Beyonc%C3%A9_discography"

raw_bey_html <- read_html(bey_url)

# bey_raw: one row per album, with the multi-line "album details" cell spread
# into one column per detail type. `sales` stays a list-column of strings.
bey_raw <- raw_bey_html %>%
  # NOTE(review): positional selector -- confirm it still points at the
  # intended table on the live page before re-running.
  html_node("#mw-content-text > div.mw-parser-output > table:nth-child(14)") %>%
  html_table(fill = TRUE) %>%
  data.frame() %>%
  janitor::clean_names() %>%
  tibble() %>%
  # Remove the first and the last row of the scraped table.
  slice(-c(1, nrow(.))) %>%
  select(-certifications) %>%
  # Split the newline-separated cells into list-columns.
  mutate(across(c(album_details, sales), ~ str_split(.x, "\n"))) %>%
  unnest_longer(album_details) %>%
  separate(
    col = album_details,
    into = c("album_detail_type", "album_details"),
    sep = ": "
  ) %>%
  # Fold "Re-edition" into "Re-release" so both share one column.
  mutate(
    album_detail_type = replace(
      album_detail_type,
      album_detail_type %in% "Re-edition",
      "Re-release"
    )
  ) %>%
  pivot_wider(
    names_from = album_detail_type,
    values_from = album_details
  ) %>%
  janitor::clean_names()

# bey_sales: Beyoncé sales in long form. Each entry of the `sales`
# list-column becomes its own row, split into `country` and a numeric `sales`.
bey_sales <- bey_raw %>%
  unnest_longer(sales) %>%
  separate(sales, into = c("country", "sales"), sep = ": ") %>%
  # Trim surrounding whitespace, then keep only the numeric part.
  mutate(sales = parse_number(str_trim(sales))) %>%
  mutate(artist = "Beyoncé") %>%
  # `format` is renamed to `formats` to line up with the Taylor Swift table.
  select(artist, title, country, sales, released:label, formats = format)

# bey_chart: Beyoncé peak chart positions in long form, one row per album
# and chart.
bey_chart <- bey_raw %>%
  select(title, released:label, formats = format, contains("peak_chart")) %>%
  pivot_longer(
    cols = contains("peak_chart"),
    names_to = "chart",
    values_to = "chart_position"
  ) %>%
  mutate(
    # Reduce the column name to its numeric suffix ("", "_1", ..., "_9") ...
    chart = str_remove(chart, "peak_chart_positions"),
    # ... and translate the suffix into a chart label by position; any other
    # suffix becomes NA.
    # NOTE(review): the suffix-to-chart order is hard-coded -- confirm it
    # matches the column order of the scraped table.
    chart = c("US", "AUS", "CAN", "FRA", "GER", "IRE", "JPN", "NZ", "SWE", "UK")[
      match(chart, c("", "_1", "_2", "_3", "_4", "_5", "_6", "_7", "_8", "_9"))
    ]
  ) %>%
  mutate(artist = "Beyoncé", .before = title)

# Stack the per-artist tables into one sales table and one charts table.
all_sales <- list(ts_sales, bey_sales) %>%
  bind_rows()
all_charts <- list(ts_chart, bey_chart) %>%
  bind_rows()

# Write both tables to CSV; the paths are relative to the working directory.
all_sales %>%
  write_csv("2020/2020-09-29/sales.csv")
all_charts %>%
  write_csv("2020/2020-09-29/charts.csv")