Andrew Sheveliev – R-posts.com

Most popular on Netflix, Disney+, Hulu and HBOmax. Weekly Tops for last 60 days

Couple months ago I published Most popular on Netflix. Daily Tops for last 60 days – small research based on daily scraping data answering following questions: How many movies (titles) made the Netflix Daily Tops? What movie was the longest #1 on Netflix? For how many days movies / TV shows stay in Tops and as #1? etc.
This time I am sharing analysis of the most popular movies / TV shows across Netflix, Disney+, Hulu and HBOmax on weekly basis, instead of daily, with anticipation of better trends catching.

So, let`s count how many movies made the top5, I assume it is less than 5 *60…

library(tidyverse)
library (gt)

platforms <- c('Disney+','HBOmax', 'Hulu', 'Netflix') # additionally, load CSV data using readr

Wrangle raw data – reverse (fresh date first), take top 5, take last 60 days

fjune_dt % rev () %>% slice (1:5) %>% select (1:60)
fdjune_dt % rev () %>% slice (1:5) %>% select (1:60)
hdjune_dt % rev () %>% slice (1:5) %>% select (1:60)
hulu_dt % rev () %>% slice (1:5) %>% select (1:60)

Gather it together and count the number of unique titles in Top5 for 60 days

fjune_dt_gathered <- gather (fjune_dt)
fdjune_dt_gathered <- gather (fdjune_dt)
hdjune_dt_gathered <- gather (hdjune_dt)
hulu_dt_gathered <- gather (hulu_dt)
unique_fjune_gathered % length ()
unique_fdjune_gathered % length ()
unique_hdjune_gathered % length ()
unique_hulu_gathered % length ()
unique_gathered <- c(unique_fdjune_gathered, unique_hdjune_gathered, unique_hulu_gathered, unique_fjune_gathered)
unique_gathered <- as.data.frame (t(unique_gathered), stringsAsFactors = F)
colnames (unique_gathered) <- platforms

Let`s make a nice table for the results

unique_gathered_gt %
tab_header(
  title = "Number of unique movies (titles) in Top5")%>%
  tab_style(
    style = list(
      cell_text(color = "purple")),
    locations = cells_column_labels(
      columns = vars(HBOmax)))%>%
  tab_style(
    style = list(
      cell_text(color = "green")),
    locations = cells_column_labels(
      columns = vars(Hulu))) %>%
  tab_style(
    style = list(
      cell_text(color = "red")),
    locations = cells_column_labels(
      columns = vars(Netflix)))
unique_gathered_gt

Using similar code we can count the number of unique titles which were #1 one or more days

What movie was the longest in Tops / #1?

table_fjune_top5 <- sort (table (fjune_dt_gathered$value), decreasing = T) # Top5
table_fdjune_top5 <- sort (table (fdjune_dt_gathered$value), decreasing = T)
table_hdjune_top5 <- sort (table (hdjune_dt_gathered$value), decreasing = T)
table_hulu_top5 <- sort (table (hulu_dt_gathered$value), decreasing = T)

Plotting the results

bb5fdjune <- barplot (table_fdjune_top5 [1:5], ylim=c(0,62), main = "Days in Top5, Disney+", las = 1, col = 'blue')
text(bb5fdjune,table_fdjune_top5 [1:5] +2,labels=as.character(table_fdjune_top5 [1:5]))
bb5hdjune <- barplot (table_hdjune_top5 [1:5], ylim=c(0,60), main = "Days in Top5, HBO Max", las = 1, col = 'grey', cex.names=0.7)
text(bb5hdjune,table_hdjune_top5 [1:5] +2,labels=as.character(table_hdjune_top5 [1:5]))
bb5hulu <- barplot (table_hulu_top5 [1:5], ylim=c(0,60), main = "Days in Top5, Hulu", las = 1, col = 'green')
text(bb5hulu,table_hulu_top5 [1:5] +2,labels=as.character(table_hulu_top5 [1:5]))
bb5fjune <- barplot (table_fjune_top5 [1:5], ylim=c(0,60), main = "Days in Top5, Netflix", las = 1, col = 'red')
text(bb5fjune,table_fjune_top5 [1:5] +2,labels=as.character(table_fjune_top5 [1:5]))

The same for the movies / TV shows reached the first place in weekly count

Average days in top distribution

#top 5
ad5_fjune <- as.data.frame (table_fjune_top5, stringsAsFActrors=FALSE)
ad5_fdjune <- as.data.frame (table_fdjune_top5, stringsAsFActrors=FALSE)
ad5_hdjune <- as.data.frame (table_hdjune_top5, stringsAsFActrors=FALSE)
ad5_hulu <- as.data.frame (table_hulu_top5, stringsAsFActrors=FALSE)
par (mfcol = c(1,4))
boxplot (ad5_fdjune$Freq, ylim=c(0,20), main = "Days in Top5, Disney+")
boxplot (ad5_hdjune$Freq, ylim=c(0,20), main = "Days in Top5, HBO Max")
boxplot (ad5_hulu$Freq, ylim=c(0,20), main = "Days in Top5, Hulu")
boxplot (ad5_fjune$Freq, ylim=c(0,20), main = "Days in Top5, Netflix")

The same for the movies / TV shows reached the first place in weekly count (#1)

Most popular on Netflix. Daily Tops for last 60 days

Everyday, around 9 pm, I get fresh portion of the Netflix Top movies / TV shows. I’ve been doing this for more than two months and decided to show the first results answering following questions:

How many movies / TV shows make the Top?
What movie was the longest #1 on Netflix?
What movie was the longest in Tops?
For how many days movies / TV shows stay in Tops and as #1?
To have a try to plot all this up and down zigzags…

I took 60 days span (almost all harvest so far) and Top5 overall, not Top10, in each category to talk about really the most popular and trendy.

So, let`s count how many movies made the Top5, I mean it is definitely less than 5 *60…

library(tidyverse)
library (stats)
library (gt)
fjune <- read_csv("R/movies/fjune.csv")
#wrangle raw data - reverse (fresh date first), take top 5, take last 60 days
fjune_dt %>% rev () %>% slice (1:5) %>% select (1:60)

#gather it together and count the number of unique titles 
fjune_dt_gathered <- gather (fjune_dt)
colnames (fjune_dt_gathered) <- c("date", "title")
unique_fjune_gathered <- unique (fjune_dt_gathered$title)

str (unique_fjune_gathered)
#chr [1:123] "Unknown Origins" "The Debt Collector 2" "Lucifer" "Casino Royale"

OK, it is 123 out of 300.
Now, it is good to have distribution in each #1 #2 #3 etc.

t_fjune_dt <- as.data.frame (t(fjune_dt),stringsAsFactors = FALSE)
# list of unique titles in each #1 #2 #3etc. for each day
unique_fjune <- sapply (t_fjune_dt, unique)

# number of unique titles in each #1 #2 #3 etc.
n_unique_fjune <- sapply (unique_fjune, length)
n_unique_fjune <- setNames (n_unique_fjune, c("Top1", "Top2", "Top3", "Top4", "Top5"))

n_unique_fjune
Top1 Top2 Top3 Top4 Top5
32   45   45   52   49

What movie was the longest in Tops / #1?

# Top5
table_fjune_dt5 <- sort (table (fjune_dt_gathered$title), decreasing = T)
# Top1
table_fjune_dt1 <- sort (table (t_fjune_dt$V1), decreasing = T)

# plotting the results
bb5 <- barplot (table_fjune_dt5 [1:5], ylim=c(0,22), main = "Days in Top5", las = 1, col = 'light blue')
text(bb5,table_fjune_dt5 [1:5] +1.2,labels=as.character(table_fjune_dt5 [1:5]))
bb1 <- barplot (table_fjune_dt1 [1:5], main = "Days as Top1", las = 1, ylim=c(0,6), col = 'light green')
text(bb1,table_fjune_dt1 [1:5] +0.5, labels=as.character(table_fjune_dt1 [1:5]))

Let`s weight and rank (1st = 5, 2nd = 4, 3rd = 3 etc.) Top 5 movies / TV shows during last 60 days.

i<- (5:1)
fjune_dt_gathered5 <- cbind (fjune_dt_gathered, i) 
fjune_dt_weighted % group_by(title) %>% summarise(sum = sum (i)) %>% arrange (desc(sum))
top_n (fjune_dt_weighted, 10) %>% gt ()

As we see the longer movies stays in Top the better rank they have with simple weighting meaning “lonely stars”, which got the most #1, draw less top attention through the time span.

Average days in top

av_fjune_dt5 <- round (nrow (fjune_dt_gathered) / length (unique_fjune_gathered),1) # in Top5
av_fjune_dt1 <- round (nrow (t_fjune_dt) / length (unique_fjune$V1),1) #as Top1

cat("Average days in top5: ", av_fjune_dt5, "\n") 
cat("Average days as top1: ", av_fjune_dt1)
#Average days in top5: 2.4
#Average days as top1: 1.9

Average days in top distribution

as5 <- as.data.frame (table_fjune_dt5, stringsAsFActrors=FALSE) 

as1 <- as.data.frame (table_fjune_dt1, stringsAsFActrors=FALSE) 
par (mfcol = c(1,2)) 
boxplot (as5$Freq, ylim=c(0,7), main = "Days in Top5") 
boxplot (as1$Freq, ylim=c(0,7), main = "Days as Top1")

And now, let`s try to plot Top5 movies / TV shows daily changes.
I played with different number of days and picked 4 – with longer span I had less readable plots since every new day brings new comers into the Top with its own line, position in the legend, and, ideally its own color.

fjune_dt_gathered55 <- fjune_dt_gathered5 %>% slice (1:20) %>% rev ()
ggplot(fjune_dt_gathered55)+ aes (x=date, y=i, group = title) + geom_line (aes (color = title), size = 2) +
geom_point (aes (color = title), size = 5)+ theme_classic()+
theme(legend.position="right")+ylab("top")+ ylim ("5", "4", "3", "2", "1")+
geom_line(aes(colour = title), arrow = arrow(angle = 15, ends = "last", type = "closed"))+scale_color_brewer(palette="Paired")

Netflix vs Disney+. Who has more fresh titles?

“Let`s shift to Disney+”, said my wife desperately browsing Netflix on her phone. “Netflix has much more fresh content” I argued. And…I realized I need numbers to close these family debates…

I choose “2018 and later” criteria as “fresh” and “1999 and earlier” as “old”. Those criteria are not strict but I needed certainty to have completely quantified arguments to keep my Netflix on the family throne.

Since I did not find such numbers on both streaming services websites I decided to scrape full lists of movies and TV shows and calculate % of “fresh” / “old” movies in entire lists. Likely, there are different websites listing all movies and TV shows both on Disney+ and Netflix and updating them daily. I choose one based on popularity (I checked it both with Alexa and Similar Web) and its architecture making my scraping job easy and predictable.

library (tidyverse)
library (rvest)
disney0 <- read_html("https://www.finder.com/complete-list-disney-plus-movies-tv-shows-exclusives/")

Locate proper CSS element using Selector Gadget

disney_year_0 <- html_nodes(disney0, 'td:nth-child(2)')
disney_year <- html_text (disney_year_0, trim=TRUE)
# list of release years for every title
length (disney_year) #1029

Now, let`s create frequency table for each year of release

disney_year_table <- as.data.frame (sort (table (disney_year), decreasing = TRUE), stringsAsFactors = FALSE)
colnames (disney_year_table) <- c('years', 'count') 

glimpse (disney_year_table)
Rows: 89 Columns: 2 $ years  "2019", "2017", "2016", "2018", "2015", "2011", "2014", "2012", "2010", "200... $ count  71, 58, 48, 40, 36, 35, 35, 34, 31, 28, 28, 27, 27, 25, 24, 24, 23, 23, 21,

Finally, count ‘fresh’ and 'old' % in entire Disney+ offer

disney_year_table$years <- disney_year_table$years %>% as.numeric () 

fresh_disney_years <- disney_year_table$years %in% c(2018:2020) 
fresh_disney_num <- paste (round (sum (disney_year_table$count[fresh_disney_years]) / sum (disney_year_table$count)*100),'%') 

old_disney_years <- disney_year_table$years < 2000
old_disney_num <- paste (round (sum (disney_year_table$count[old_disney_years]) / sum (disney_year_table$count)*100),'%')

So, I`ve got roughly 14% “fresh” and 33% “old” movie on Disney+. Let`s see the numbers for Netflix.
For Netflix content our source website does not have single page so I scraped movies and TV shows separately.

#movies
netflix_mov0 <- read_html("https://www.finder.com/netflix-movies/")
netflix_mov_year_0 <- html_nodes(netflix_mov0, 'td:nth-child(2)') 
netflix_mov_year <- tibble (html_text (netflix_mov_year_0, trim=TRUE))
colnames (netflix_mov_year) <- "year" 

#TV shows 
netflix_tv0 <- read_html("https://www.finder.com/netflix-tv-shows/")
netflix_tv_year_0 <- html_nodes(netflix_tv0, 'td:nth-child(2)')
netflix_tv_year <- tibble (html_text (netflix_tv_year_0, trim=TRUE))
colnames (netflix_tv_year) <- "year"

Code for final count of the at Neflix ‘fresh’ and ‘old’ movies / TV shows portions is almost the same as for Disney

netflix_year <- rbind (netflix_mov_year, netflix_tv_year)
nrow (netflix_year)
netflix_year_table <- as.data.frame (sort (table (netflix_year), decreasing = TRUE), stringsAsFactors = FALSE) colnames (netflix_year_table) <- c('years', 'count')

glimpse (netflix_year_table)
Rows: 68
Columns: 2
$ years  2018, 2019, 2017, 2016, 2015, 2014, 2020, 2013, 2012, 2010,
$ count  971, 882, 821, 653, 423, 245, 198, 184, 163, 127, 121, 108,

Count 'fresh' and 'old' % in entire offer

netflix_year_table$years <- netflix_year_table$years %>% as.numeric ()

fresh_netflix_years <- netflix_year_table$years %in% c(2018:2020)
fresh_netflix_num <- paste (round (sum (netflix_year_table$count[fresh_netflix_years]) / sum (netflix_year_table$count)*100),'%') 

old_netflix_years <- netflix_year_table$years < 2000
old_netflix_years <- na.omit (old_netflix_years)
old_netflix_num <- paste (round (sum (netflix_year_table$count[old_netflix_years]) / sum (netflix_year_table$count)*100),'%')

So, finally, I can build my arguments based on numbers, freshly baked and bold

How to measure movies vocabulary and display results in nice tables using tidytext and gt packages. Movies text analysis. Part 2.

My primary goal was to research popular movies vocabulary – words all those heroes actually saying to us and put well known movies onto the scale from the ones with the widest lexicon down to below average range despite possible crowning by IMDB rating, Box Office revenue or number / lack of Academy Awards.

Main unit of analysis is Number of unique words per 1000 which says about how WIDE the vocabulary of each movie / TV show is. I saw and personally made similar analysis for literature, music or web content but never for real top movies texts, not scripts. This post is the 2nd Part of the Movies text analysis and covers analysis itself and displaying results in nice looking tables using gt package. Part 1, previously published on R-Bloggers, covers preparation stage and movie texts cleaning using textclean package. I created CSV file featuring movies:

Movies_prep <- read_csv("R/movies/Movies_prep.csv")
Movies_prep [1:5,]

Resulting table will be plugged with three more parameters needed to be calculated.

Number of words (for entire movie).
Words per minute (Number of words / Movie length).
Vocabulary (Number of unique words per 1000 words).

library (tidyverse)    
library (tidytext)
library (textstem)
library (gt)

Bind clean text (described in Movies text analysis. Part 1) to the titles

movies_ <- select (Movies_prep, title = title)
movies_bind <- cbind (movies_, text)

Let`s use tidytext package to Unnest text to words

 movies_words  <- unnest_tokens (movies_bind, word, text)

Simply calculate nwords and words per minute for each movie

title_ <- movies_bind$title
movies_nword <- function (i){movies_nwords1<- movies_words %>% filter (title==i) %>% nrow ()
movies_nwords1}
movies_nwords <- sapply (title_, movies_nword)

Movies_prep1 <-  Movies_prep %>% mutate (words = movies_nwords, wpminute = round (movies_nwords/duration))

Let`s look at the Words per minute parameter first.

wpm_summary <- Movies_prep1$wpminute %>% summary ()

wpm_summary Min. 1st Qu. Median Mean 3rd Qu. Max. 37.00 66.00 80.00 85.89 104.00 182.00 Let`s check ten most wordy movies in our dataset

Movies_prep1_GT <- Movies_prep1 %>% select (title, wpminute, genre, type) 
Movies_preptop_GT <- Movies_prep1_GT %>% arrange (desc(wpminute)) %>% slice (1:10) %>% gt ()

I added tiny command gt () from gt package and my slice became nice looking table:

The same for ten least wordy movies

Movies_prepbot_GT <- Movies_prep1_GT %>% arrange (wpminute) %>% slice (1:10) %>% gt ()

Only Reality Show I added as control value is obvious outlier. While the most silent are well known epic movies. We will explore gt package more deep a later. Now, let`s check the range for Total Number of words

words_summary <- Movies_prep1$words %>% summary ()

Min. 1st Qu. Median Mean 3rd Qu. Max.
5981 8619 10311 10815 12467 18710

I wish all of them were exact 10,000 words length. Or any other but equal length for all movies. Real life is not so round. Unlike music vocabulary, we cannot take "99.7 songs" to have the same length for every peer. Why we need that? We cannot properly compare Number of unique words within different pieces of text unless they all are the same length. Any one sentence has up to 100% words uniqueness, 100 sentences - up to 50%. Any entire movie – much less due to marginal saturation e.g. usage the words we already used.
Before we solve this problem we should lemmatize clean text.

movies_lem<- lemmatize_words (movies_words$word)
movies <- movies_words %>% mutate (word = movies_lem)

And, vocabulary unique words per 1000 for each movie (I use minimal length along the all movies in dataset with sampling all movies text the same size = length of the shortest (N of words) movie.

nwords_min <- min(movies_nwords)
vocab1 <- function (z) {vocab2<- movies_words %>% filter (title==z)
vocab3<- as.data.frame (replicate (5, sample (vocab2$word, nwords_min, replace = FALSE)), stringsAsFactors = FALSE)
vocab4 <- round (mean (sapply (sapply (vocab3, unique), length))/nwords_min *1000)
vocab4}
vocab <- sapply (title_, vocab1)

Summary (vocab)
Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
152.0   176.0   188.0   188.7   200.0   251.0 

Movies_final <-  Movies_prep1 %>% mutate (vocabulary = vocab)

I plan to compare and visualize how wide vocabulary is within iconic movies, different genres, Movies vs TV Shows, IMDB TOP 100 vs Box Office All Time Best etc. in my next post (Movies text analysis. Part 3.) For this part I want to explore further gt package and display movies ranking by its vocabulary:

Movies_final_GT <- Movies_final %>% select (title, vocabulary, genre, type, feature) %>%
  top_n (20, vocabulary) %>% mutate (N = seq (20))
Movies_final_GT <- Movies_final_GT [,c(6,1,2,3,4,5)] %>% gt() %>%
  tab_header(
  title = "Number of Unique Words Used in Movie / TV Show (per each 1000 words)")

Using gt package we put top 15 movies by number of unique words in table. This time with header.

And we can easily save the table as graphical output.

gtsave(Movies_final_GT, filename = 'Movies_final_GT.png')
gtsave(Movies_final_GT, filename = 'Movies_final_GT.pdf')

Let`s format the table a bit.

Movies_final_GTT <- Movies_final_GT %>%
    tab_style(
    style = list(
      cell_text(weight = "bold")),
    locations = cells_column_labels(
      columns = vars(N, title, vocabulary, genre, type, feature))) %>% 
  tab_style(
  style = list(
      cell_text(weight = "bold")),
locations = cells_body(
  columns = vars(title, vocabulary))) %>%
  tab_style(
    style = list(
      cell_text(style = "italic")),
locations = cells_body(
        columns = vars(title, type)))

We can change cells and text colors and even make it conditional (for instance, only ‘type == TV’).

Movies_final_GTT <- Movies_final_GTT %>%
  tab_style(
    style = list(
      cell_text(color = "blue")),
locations = cells_body(
      columns = vars(vocabulary))) %>%
  tab_style(
    style = list(
      cell_fill(color = "#F9E3D6")),
locations = cells_body(
      columns = vars(type),
      rows = type == "TV"))

That`s it for tables and for Part 2 of my Movies text analysis. In Part 3 I plan to use ggplot2 to visualize and compare how wide vocabulary is within iconic movies, different genres, Movies vs TV Shows, IMDB TOP 100 vs Box Office All Time Best etc. See ya.

Easy text loading and cleaning using readtext and textclean packages. Movies text analysis. Part 1

This post is the 1^st Part of the Movies text analysis and covers Text loading and cleaning using readtext and textclean packages. My first challenge was texts source – I saw text mining projects based on open data scripts but I wanted something new, more direct and, as close to what we actually hear AND read from the screen. I decided directly save subtitles while streaming using Chrome developer panel.

It requires further transformation through the chain .xml > .srt > .txt (not covered here).
Since we have .txt files ready, let`s start from downloading of texts stored in separate folder (1movie = 1 file).

library(readtext)
text_raw <-readtext(paste0(path="C:/Users/Andrew/Documents/R/movies/txt/*"))
text <- gsub("\r?\n|\r", " ", text_raw)

To go further it is important to understand what is text and what is noise here. Since subtitles are real text representation of entire movie, its text part contains:

Speech (actual text)
Any sound (made by heroes/characters or around)
Speakers / characters marks (who is saying)
Intro / end themes
Other text

All but first are noise, at least for purpose of measuring and compering vocabulary / lexicon. Luckily, there are limited number of patterns to distinct Speech from other sound / noise so using string processing is possible to husk the speech leaving noise apart. Let`s start from the brackets and clean not only brackets but what is inside also using textclean package.

library (textclean)

text1 <- str_replace_all (text, "\\[.*?\\]", "")
text1_nchar <- sum (nchar (text1))

It is important to check how hard your trim your "garden", you could do it element-wise if the summary shows any odd losses or, in opposite, very little decrease of characters #.

the same with () +()

text2 <- str_replace_all(text1, "\\(.*?\\)", "")
text2_nchar <- sum (nchar (text2))

Do not forget any odd characters detected, like, in my case, ‘ï»¿File:’

text3 <- str_replace (text2,'ï»¿File:', '')
text3_nchar <- sum (nchar (text3))

Now is the trickiest part, other brackets - ♪♪ Ideally, it requires you to know the text well to distinct theme songs and background music from the situation where singing is the way the heroes “talk” to us. For instance musicals or a lot of kids movies.

text4 <- str_replace_all(text3,"\\âTª.*?\\âTª", "")
text4_nchar <- sum (nchar (text4))

Speech markers e.g. JOHN:

text5 <- str_replace_all(text4,"[a-zA-Z]+:", "")
text5_nchar <- sum (nchar (text5))

Check % of total cut

Stop cut the weed, let`s replace the rest): @#$%&

text6 <- replace_symbol(text5, dollar = TRUE, percent = TRUE, pound = TRUE,
                        at = TRUE, and = TRUE, with = TRUE)
text6_nchar <- sum (nchar (text6))

#time stamps
text7 <- replace_time(text6, pattern = "(2[0-3]|[01]?[0-9]):([0-5][0-9])[.:]?([0-5]?[0-9])?")
text7_nchar <- sum (nchar (text7))

#date to text
replace_date_pattern <- paste0(
  '([01]?[0-9])[/-]([0-2]?[0-9]|3[01])[/-]\\d{4}|\\d{4}[/-]', 
  '([01]?[0-9])[/-]([0-2]?[0-9]|3[01])')
text8 <- replace_time(text7, pattern = replace_date_pattern)
text8_nchar <- sum (nchar (text8))

#numbers to text, e.g. 8 = eight
text9 <- replace_number(text8, num.paste = FALSE, remove = FALSE)
text9_nchar <- sum (nchar (text9))

Now texts are clean enough and ready for further analysis. That`s it for Movies text analysis. Part 1.

In Movies text analysis. Part 2. I plan to compare how wide, in general, vocabulary is within iconic movies, different genres, Movies vs TV Shows, IMDB TOP 100 vs Box Office All Time Best etc.

In addition, having text data and the movies text length I Planned to check #of words per minute parameter to defy or confirm hypothesis that movies speech is significantly different in range and slower overall than normal speech benchmark in 120-150 WPM (words per minute).