The Sentiment of PewDiePie

PewDiePie recently announced that he is taking a break from YouTube, but he is still one of the best-known human brands on the platform. I tried to visualise how the sentiment of the comments on his videos developed over time.

This analysis uses some R packages. The dataset was exported from MySQL using the following SQL command:

-- Build the export table: keep only comments with more than 10 likes and
-- strip carriage returns, line breaks, double quotes and commas from the text
-- (presumably so that every comment fits on one unquoted CSV line).
create table top_comments as
select REPLACE(REPLACE(REPLACE(REPLACE(text, '\r', ''), '\n', ' '), '"', ' '), ',', ' ') as text,
       publishedAt
from comments
where likeCount > 10;

So I only used comments with more than 10 likes and stored the result in a CSV file called "top_comments.csv", which I will use later on.

library(syuzhet)
library(ggplot2)
library(readr)
library(feather)
library(plyr)
# Load the exported comments: a free-text column and a timestamp column.
comments_raw <- read_csv(
  file = "../data/top_comments.csv",
  col_types = cols(
    text = col_character(),
    publishedAt = col_datetime(format = "")
  )
)

# Drop every row that has a missing value in any column.
comments_cleaned <- na.omit(comments_raw)

# Keep only the comments whose text is longer than three characters.
comments_cleaned <- comments_cleaned[nchar(comments_cleaned$text) > 3, ]

# Draw `n` rows from `df` uniformly at random, without replacement.
#
# df: data frame (or tibble) to sample from.
# n:  number of rows to draw; asking for more rows than `df` has makes the
#     underlying sampler raise an error.
#
# Returns a data frame holding the selected rows in random order.
randomRows <- function(df, n) {
  # sample.int() always draws from 1..nrow(df); drop = FALSE keeps the result
  # a data frame even when `df` has only a single column.
  df[sample.int(nrow(df), n), , drop = FALSE]
}
# Work on the full cleaned data set. The commented-out line below is an
# alternative that would analyse a random sample of 10000 comments instead.
comments <- comments_cleaned
#comments <- randomRows(comments_cleaned, 10000)
     text            publishedAt                 
 Length:208128      Min.   :2013-11-07 00:25:17  
 Class :character   1st Qu.:2014-06-28 00:28:10  
 Mode  :character   Median :2015-08-31 19:49:03  
                    Mean   :2015-06-13 07:06:28  
                    3rd Qu.:2016-06-02 18:43:47  
                    Max.   :2016-11-14 01:15:46  
# Score a comment with four sentiment lexicons and return the average score.
#
# text: character value holding the comment text.
#
# Returns a single numeric value: the mean of the "afinn", "syuzhet", "bing"
# and "nrc" results of get_sentiment(), ignoring missing values, or 0 when no
# usable score is available.
analyse_sentiment <- function(text) {
  lexicons <- c("afinn", "syuzhet", "bing", "nrc")
  scores <- unlist(lapply(lexicons, function(lexicon) {
    get_sentiment(text, method = lexicon)
  }))

  m <- mean(scores, na.rm = TRUE)

  # mean() yields NaN when every score is missing; fall back to a neutral 0.
  # `&&` is the scalar, short-circuiting operator that belongs in `if`.
  if (is.numeric(m) && !is.na(m)) {
    m
  } else {
    0.0
  }
}

# Score every comment individually. vapply() iterates over the text vector
# directly (no data-frame-to-matrix coercion as with apply()) and guarantees
# exactly one numeric score per comment, including for an empty table.
comments$sentiment <- vapply(
  comments$text,
  analyse_sentiment,
  numeric(1),
  USE.NAMES = FALSE
)

# Write the scored comments to a feather file and read them back from it
# (presumably so later runs can start from the cached file).
write_feather(comments, "../data/top_comments.feather")
comments <- read_feather("../data/top_comments.feather")
# Set the repr plot size options (width 10, height 4).
options(repr.plot.width = 10, repr.plot.height = 4)

# Distribution of the per-comment sentiment scores.
hist(
  x = comments$sentiment,
  xlab = "Sentiment",
  main = "Sentiment Histogram"
)

# Plot the mean comment sentiment per calendar day.
# Everything runs inside local() so the helper `day` column and the
# aggregated table stay out of the global `comments` data.
local({
  daily_comments <- comments
  # floor_date() is namespace-qualified because library(lubridate) is only
  # attached further down in this file; the bare name would not be found here.
  daily_comments$day <- lubridate::floor_date(daily_comments$publishedAt, "day")
  daily_grouped_sentiment <- ddply(
    daily_comments, .(day), summarize,
    sentiment = mean(sentiment)
  )

  ggplot(data = daily_grouped_sentiment, aes(x = day, y = sentiment)) +
    geom_bar(stat = "identity", fill = "coral3") +
    labs(
      title = "Sentiment in Development (daily resolution)",
      y = "Sentiment",
      x = "Day"
    ) +
    theme_minimal()
})

# Assign each comment to a 7-day bucket counted from 1970-01-01: the timestamp
# in seconds is divided by the length of a week and truncated to whole weeks.
comments$week <- as.Date("1970-01-01") +
  7 * trunc(as.numeric(comments$publishedAt) / (7 * 24 * 3600))

# Mean sentiment per weekly bucket.
weekly_grouped_comments <- ddply(
  comments,
  .(week),
  summarize,
  sentiment = mean(sentiment)
)

# Distribution of the weekly mean sentiment values.
hist(
  x = weekly_grouped_comments$sentiment,
  xlab = "Sentiment",
  main = "Sentiment Histogram"
)

# Parse day/month/year strings such as "18/12/2014" into Date values.
#
# date: value(s) to convert, written as "dd/mm/yyyy".
#
# Returns the result of as.Date() for the given input and format.
toD <- function(date) {
  day_month_year <- "%d/%m/%Y"
  as.Date(date, day_month_year)
}

# Annotations for the weekly chart, one row per event.
dfrLabels <- data.frame(
  # Date at which the label and its arrow are drawn.
  date = toD(c("10/02/2016", "29/08/2014", "18/12/2014")),
  # Text shown for the event.
  event = c("YTRED", "Goodbye comments", "Asked for support"),
  # y value at which each arrow ends (its tip).
  arrow_spire = c(0.2, 0.2, 0.2),
  # y value at which each arrow starts.
  arrow_start = c(0.45, 0.55, 0.45),
  # y value at which each label is placed.
  text_start = c(0.5, 0.6, 0.5)
)

# Weekly mean sentiment as a bar chart, annotated with the events listed in
# `dfrLabels`.
g <- ggplot(data = weekly_grouped_comments, aes(x = week, y = sentiment)) +
  geom_bar(stat = "identity", fill = "coral3") +
  # Event names, placed at their `text_start` height.
  geom_text(
    data = dfrLabels,
    aes(x = date, y = text_start, label = event),
    size = 3,
    color = "#333333"
  ) +
  # Vertical arrows from `arrow_start` to `arrow_spire` at each event date.
  geom_segment(
    data = dfrLabels,
    aes(x = date, y = arrow_start, xend = date, yend = arrow_spire),
    arrow = arrow(length = unit(0.01, "npc"))
  ) +
  theme_minimal()

# The plot object is stored in `g`, so the labels must be added to `g`.
g + labs(
  title = "PewDiePie comment Sentiment (weekly resolution)",
  x = "Week",
  y = "Sentiment"
)

# List the weeks whose mean sentiment is below -0.5. The weekly aggregate is
# stored in `weekly_grouped_comments`; which() drops any NA comparisons.
weekly_grouped_comments[which(weekly_grouped_comments$sentiment < -0.5), ]
week sentiment
59 2014-12-18 -0.5826566

Now I want to take a closer look at the large negative spike around the 18th of December 2014. To do so, I group the data by day around that date.

library(lubridate)

# Boundaries of the inspected window; the strict comparisons below exclude
# both boundary days themselves.
s_date <- as.Date("10/12/2014", format = "%d/%m/%Y")
e_date <- as.Date("31/12/2014", format = "%d/%m/%Y")

# Plot the mean sentiment per day for the comments published inside the window.
local({
  published_on <- as.Date(comments$publishedAt)
  in_window <- published_on > s_date & published_on < e_date

  event_comments <- comments[which(in_window), ]
  event_comments$day <- floor_date(event_comments$publishedAt, "day")
  daily_grouped_comments <- ddply(
    event_comments,
    .(day),
    summarize,
    sentiment = mean(sentiment)
  )

  ggplot(data = daily_grouped_comments, aes(x = day, y = sentiment)) +
    geom_bar(stat = "identity", fill = "coral3") +
    theme_minimal() +
    labs(
      title = "PewDiePie's YouTube comments sentiment (end of December 2014)",
      x = "Day",
      y = "Sentiment"
    )
})