Sourabh Joshi Final project

The project will involve the following steps: 1. Data preprocessing, 2. Model development for regression, 3. Model development for classification, 4. Model interpretation, 5. Model validation.

library(caret)
library(dplyr)
library(pROC)

Data preprocessing

#loading the data 
library(readr)
data <- read_csv("~/Downloads/OnlineNewsPopularity/OnlineNewsPopularity.csv")
str(data)

## spc_tbl_ [39,644 × 61] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ url                          : chr [1:39644] "http://mashable.com/2013/01/07/amazon-instant-video-browser/" "http://mashable.com/2013/01/07/ap-samsung-sponsored-tweets/" "http://mashable.com/2013/01/07/apple-40-billion-app-downloads/" "http://mashable.com/2013/01/07/astronaut-notre-dame-bcs/" ...
##  $ timedelta                    : num [1:39644] 731 731 731 731 731 731 731 731 731 731 ...
##  $ n_tokens_title               : num [1:39644] 12 9 9 9 13 10 8 12 11 10 ...
##  $ n_tokens_content             : num [1:39644] 219 255 211 531 1072 ...
##  $ n_unique_tokens              : num [1:39644] 0.664 0.605 0.575 0.504 0.416 ...
##  $ n_non_stop_words             : num [1:39644] 1 1 1 1 1 ...
##  $ n_non_stop_unique_tokens     : num [1:39644] 0.815 0.792 0.664 0.666 0.541 ...
##  $ num_hrefs                    : num [1:39644] 4 3 3 9 19 2 21 20 2 4 ...
##  $ num_self_hrefs               : num [1:39644] 2 1 1 0 19 2 20 20 0 1 ...
##  $ num_imgs                     : num [1:39644] 1 1 1 1 20 0 20 20 0 1 ...
##  $ num_videos                   : num [1:39644] 0 0 0 0 0 0 0 0 0 1 ...
##  $ average_token_length         : num [1:39644] 4.68 4.91 4.39 4.4 4.68 ...
##  $ num_keywords                 : num [1:39644] 5 4 6 7 7 9 10 9 7 5 ...
##  $ data_channel_is_lifestyle    : num [1:39644] 0 0 0 0 0 0 1 0 0 0 ...
##  $ data_channel_is_entertainment: num [1:39644] 1 0 0 1 0 0 0 0 0 0 ...
##  $ data_channel_is_bus          : num [1:39644] 0 1 1 0 0 0 0 0 0 0 ...
##  $ data_channel_is_socmed       : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
##  $ data_channel_is_tech         : num [1:39644] 0 0 0 0 1 1 0 1 1 0 ...
##  $ data_channel_is_world        : num [1:39644] 0 0 0 0 0 0 0 0 0 1 ...
##  $ kw_min_min                   : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_max_min                   : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_avg_min                   : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_min_max                   : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_max_max                   : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_avg_max                   : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_min_avg                   : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_max_avg                   : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_avg_avg                   : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
##  $ self_reference_min_shares    : num [1:39644] 496 0 918 0 545 8500 545 545 0 0 ...
##  $ self_reference_max_shares    : num [1:39644] 496 0 918 0 16000 8500 16000 16000 0 0 ...
##  $ self_reference_avg_sharess   : num [1:39644] 496 0 918 0 3151 ...
##  $ weekday_is_monday            : num [1:39644] 1 1 1 1 1 1 1 1 1 1 ...
##  $ weekday_is_tuesday           : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_wednesday         : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_thursday          : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_friday            : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_saturday          : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_sunday            : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
##  $ is_weekend                   : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
##  $ LDA_00                       : num [1:39644] 0.5003 0.7998 0.2178 0.0286 0.0286 ...
##  $ LDA_01                       : num [1:39644] 0.3783 0.05 0.0333 0.4193 0.0288 ...
##  $ LDA_02                       : num [1:39644] 0.04 0.0501 0.0334 0.4947 0.0286 ...
##  $ LDA_03                       : num [1:39644] 0.0413 0.0501 0.0333 0.0289 0.0286 ...
##  $ LDA_04                       : num [1:39644] 0.0401 0.05 0.6822 0.0286 0.8854 ...
##  $ global_subjectivity          : num [1:39644] 0.522 0.341 0.702 0.43 0.514 ...
##  $ global_sentiment_polarity    : num [1:39644] 0.0926 0.1489 0.3233 0.1007 0.281 ...
##  $ global_rate_positive_words   : num [1:39644] 0.0457 0.0431 0.0569 0.0414 0.0746 ...
##  $ global_rate_negative_words   : num [1:39644] 0.0137 0.01569 0.00948 0.02072 0.01213 ...
##  $ rate_positive_words          : num [1:39644] 0.769 0.733 0.857 0.667 0.86 ...
##  $ rate_negative_words          : num [1:39644] 0.231 0.267 0.143 0.333 0.14 ...
##  $ avg_positive_polarity        : num [1:39644] 0.379 0.287 0.496 0.386 0.411 ...
##  $ min_positive_polarity        : num [1:39644] 0.1 0.0333 0.1 0.1364 0.0333 ...
##  $ max_positive_polarity        : num [1:39644] 0.7 0.7 1 0.8 1 0.6 1 1 0.8 0.5 ...
##  $ avg_negative_polarity        : num [1:39644] -0.35 -0.119 -0.467 -0.37 -0.22 ...
##  $ min_negative_polarity        : num [1:39644] -0.6 -0.125 -0.8 -0.6 -0.5 -0.4 -0.5 -0.5 -0.125 -0.5 ...
##  $ max_negative_polarity        : num [1:39644] -0.2 -0.1 -0.133 -0.167 -0.05 ...
##  $ title_subjectivity           : num [1:39644] 0.5 0 0 0 0.455 ...
##  $ title_sentiment_polarity     : num [1:39644] -0.188 0 0 0 0.136 ...
##  $ abs_title_subjectivity       : num [1:39644] 0 0.5 0.5 0.5 0.0455 ...
##  $ abs_title_sentiment_polarity : num [1:39644] 0.188 0 0 0 0.136 ...
##  $ shares                       : num [1:39644] 593 711 1500 1200 505 855 556 891 3600 710 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   url = col_character(),
##   ..   timedelta = col_double(),
##   ..   n_tokens_title = col_double(),
##   ..   n_tokens_content = col_double(),
##   ..   n_unique_tokens = col_double(),
##   ..   n_non_stop_words = col_double(),
##   ..   n_non_stop_unique_tokens = col_double(),
##   ..   num_hrefs = col_double(),
##   ..   num_self_hrefs = col_double(),
##   ..   num_imgs = col_double(),
##   ..   num_videos = col_double(),
##   ..   average_token_length = col_double(),
##   ..   num_keywords = col_double(),
##   ..   data_channel_is_lifestyle = col_double(),
##   ..   data_channel_is_entertainment = col_double(),
##   ..   data_channel_is_bus = col_double(),
##   ..   data_channel_is_socmed = col_double(),
##   ..   data_channel_is_tech = col_double(),
##   ..   data_channel_is_world = col_double(),
##   ..   kw_min_min = col_double(),
##   ..   kw_max_min = col_double(),
##   ..   kw_avg_min = col_double(),
##   ..   kw_min_max = col_double(),
##   ..   kw_max_max = col_double(),
##   ..   kw_avg_max = col_double(),
##   ..   kw_min_avg = col_double(),
##   ..   kw_max_avg = col_double(),
##   ..   kw_avg_avg = col_double(),
##   ..   self_reference_min_shares = col_double(),
##   ..   self_reference_max_shares = col_double(),
##   ..   self_reference_avg_sharess = col_double(),
##   ..   weekday_is_monday = col_double(),
##   ..   weekday_is_tuesday = col_double(),
##   ..   weekday_is_wednesday = col_double(),
##   ..   weekday_is_thursday = col_double(),
##   ..   weekday_is_friday = col_double(),
##   ..   weekday_is_saturday = col_double(),
##   ..   weekday_is_sunday = col_double(),
##   ..   is_weekend = col_double(),
##   ..   LDA_00 = col_double(),
##   ..   LDA_01 = col_double(),
##   ..   LDA_02 = col_double(),
##   ..   LDA_03 = col_double(),
##   ..   LDA_04 = col_double(),
##   ..   global_subjectivity = col_double(),
##   ..   global_sentiment_polarity = col_double(),
##   ..   global_rate_positive_words = col_double(),
##   ..   global_rate_negative_words = col_double(),
##   ..   rate_positive_words = col_double(),
##   ..   rate_negative_words = col_double(),
##   ..   avg_positive_polarity = col_double(),
##   ..   min_positive_polarity = col_double(),
##   ..   max_positive_polarity = col_double(),
##   ..   avg_negative_polarity = col_double(),
##   ..   min_negative_polarity = col_double(),
##   ..   max_negative_polarity = col_double(),
##   ..   title_subjectivity = col_double(),
##   ..   title_sentiment_polarity = col_double(),
##   ..   abs_title_subjectivity = col_double(),
##   ..   abs_title_sentiment_polarity = col_double(),
##   ..   shares = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>

#changing to data frame
df <- data.frame(data)
str(df)

## 'data.frame':    39644 obs. of  61 variables:
##  $ url                          : chr  "http://mashable.com/2013/01/07/amazon-instant-video-browser/" "http://mashable.com/2013/01/07/ap-samsung-sponsored-tweets/" "http://mashable.com/2013/01/07/apple-40-billion-app-downloads/" "http://mashable.com/2013/01/07/astronaut-notre-dame-bcs/" ...
##  $ timedelta                    : num  731 731 731 731 731 731 731 731 731 731 ...
##  $ n_tokens_title               : num  12 9 9 9 13 10 8 12 11 10 ...
##  $ n_tokens_content             : num  219 255 211 531 1072 ...
##  $ n_unique_tokens              : num  0.664 0.605 0.575 0.504 0.416 ...
##  $ n_non_stop_words             : num  1 1 1 1 1 ...
##  $ n_non_stop_unique_tokens     : num  0.815 0.792 0.664 0.666 0.541 ...
##  $ num_hrefs                    : num  4 3 3 9 19 2 21 20 2 4 ...
##  $ num_self_hrefs               : num  2 1 1 0 19 2 20 20 0 1 ...
##  $ num_imgs                     : num  1 1 1 1 20 0 20 20 0 1 ...
##  $ num_videos                   : num  0 0 0 0 0 0 0 0 0 1 ...
##  $ average_token_length         : num  4.68 4.91 4.39 4.4 4.68 ...
##  $ num_keywords                 : num  5 4 6 7 7 9 10 9 7 5 ...
##  $ data_channel_is_lifestyle    : num  0 0 0 0 0 0 1 0 0 0 ...
##  $ data_channel_is_entertainment: num  1 0 0 1 0 0 0 0 0 0 ...
##  $ data_channel_is_bus          : num  0 1 1 0 0 0 0 0 0 0 ...
##  $ data_channel_is_socmed       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ data_channel_is_tech         : num  0 0 0 0 1 1 0 1 1 0 ...
##  $ data_channel_is_world        : num  0 0 0 0 0 0 0 0 0 1 ...
##  $ kw_min_min                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_max_min                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_avg_min                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_min_max                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_max_max                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_avg_max                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_min_avg                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_max_avg                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_avg_avg                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ self_reference_min_shares    : num  496 0 918 0 545 8500 545 545 0 0 ...
##  $ self_reference_max_shares    : num  496 0 918 0 16000 8500 16000 16000 0 0 ...
##  $ self_reference_avg_sharess   : num  496 0 918 0 3151 ...
##  $ weekday_is_monday            : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ weekday_is_tuesday           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_wednesday         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_thursday          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_friday            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_saturday          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_sunday            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ is_weekend                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ LDA_00                       : num  0.5003 0.7998 0.2178 0.0286 0.0286 ...
##  $ LDA_01                       : num  0.3783 0.05 0.0333 0.4193 0.0288 ...
##  $ LDA_02                       : num  0.04 0.0501 0.0334 0.4947 0.0286 ...
##  $ LDA_03                       : num  0.0413 0.0501 0.0333 0.0289 0.0286 ...
##  $ LDA_04                       : num  0.0401 0.05 0.6822 0.0286 0.8854 ...
##  $ global_subjectivity          : num  0.522 0.341 0.702 0.43 0.514 ...
##  $ global_sentiment_polarity    : num  0.0926 0.1489 0.3233 0.1007 0.281 ...
##  $ global_rate_positive_words   : num  0.0457 0.0431 0.0569 0.0414 0.0746 ...
##  $ global_rate_negative_words   : num  0.0137 0.01569 0.00948 0.02072 0.01213 ...
##  $ rate_positive_words          : num  0.769 0.733 0.857 0.667 0.86 ...
##  $ rate_negative_words          : num  0.231 0.267 0.143 0.333 0.14 ...
##  $ avg_positive_polarity        : num  0.379 0.287 0.496 0.386 0.411 ...
##  $ min_positive_polarity        : num  0.1 0.0333 0.1 0.1364 0.0333 ...
##  $ max_positive_polarity        : num  0.7 0.7 1 0.8 1 0.6 1 1 0.8 0.5 ...
##  $ avg_negative_polarity        : num  -0.35 -0.119 -0.467 -0.37 -0.22 ...
##  $ min_negative_polarity        : num  -0.6 -0.125 -0.8 -0.6 -0.5 -0.4 -0.5 -0.5 -0.125 -0.5 ...
##  $ max_negative_polarity        : num  -0.2 -0.1 -0.133 -0.167 -0.05 ...
##  $ title_subjectivity           : num  0.5 0 0 0 0.455 ...
##  $ title_sentiment_polarity     : num  -0.188 0 0 0 0.136 ...
##  $ abs_title_subjectivity       : num  0 0.5 0.5 0.5 0.0455 ...
##  $ abs_title_sentiment_polarity : num  0.188 0 0 0 0.136 ...
##  $ shares                       : num  593 711 1500 1200 505 855 556 891 3600 710 ...

##summary of the data 
summary(df)

##      url              timedelta     n_tokens_title n_tokens_content
##  Length:39644       Min.   :  8.0   Min.   : 2.0   Min.   :   0.0  
##  Class :character   1st Qu.:164.0   1st Qu.: 9.0   1st Qu.: 246.0  
##  Mode  :character   Median :339.0   Median :10.0   Median : 409.0  
##                     Mean   :354.5   Mean   :10.4   Mean   : 546.5  
##                     3rd Qu.:542.0   3rd Qu.:12.0   3rd Qu.: 716.0  
##                     Max.   :731.0   Max.   :23.0   Max.   :8474.0  
##  n_unique_tokens    n_non_stop_words    n_non_stop_unique_tokens
##  Min.   :  0.0000   Min.   :   0.0000   Min.   :  0.0000        
##  1st Qu.:  0.4709   1st Qu.:   1.0000   1st Qu.:  0.6257        
##  Median :  0.5392   Median :   1.0000   Median :  0.6905        
##  Mean   :  0.5482   Mean   :   0.9965   Mean   :  0.6892        
##  3rd Qu.:  0.6087   3rd Qu.:   1.0000   3rd Qu.:  0.7546        
##  Max.   :701.0000   Max.   :1042.0000   Max.   :650.0000        
##    num_hrefs      num_self_hrefs       num_imgs         num_videos   
##  Min.   :  0.00   Min.   :  0.000   Min.   :  0.000   Min.   : 0.00  
##  1st Qu.:  4.00   1st Qu.:  1.000   1st Qu.:  1.000   1st Qu.: 0.00  
##  Median :  8.00   Median :  3.000   Median :  1.000   Median : 0.00  
##  Mean   : 10.88   Mean   :  3.294   Mean   :  4.544   Mean   : 1.25  
##  3rd Qu.: 14.00   3rd Qu.:  4.000   3rd Qu.:  4.000   3rd Qu.: 1.00  
##  Max.   :304.00   Max.   :116.000   Max.   :128.000   Max.   :91.00  
##  average_token_length  num_keywords    data_channel_is_lifestyle
##  Min.   :0.000        Min.   : 1.000   Min.   :0.00000          
##  1st Qu.:4.478        1st Qu.: 6.000   1st Qu.:0.00000          
##  Median :4.664        Median : 7.000   Median :0.00000          
##  Mean   :4.548        Mean   : 7.224   Mean   :0.05295          
##  3rd Qu.:4.855        3rd Qu.: 9.000   3rd Qu.:0.00000          
##  Max.   :8.042        Max.   :10.000   Max.   :1.00000          
##  data_channel_is_entertainment data_channel_is_bus data_channel_is_socmed
##  Min.   :0.000                 Min.   :0.0000      Min.   :0.0000        
##  1st Qu.:0.000                 1st Qu.:0.0000      1st Qu.:0.0000        
##  Median :0.000                 Median :0.0000      Median :0.0000        
##  Mean   :0.178                 Mean   :0.1579      Mean   :0.0586        
##  3rd Qu.:0.000                 3rd Qu.:0.0000      3rd Qu.:0.0000        
##  Max.   :1.000                 Max.   :1.0000      Max.   :1.0000        
##  data_channel_is_tech data_channel_is_world   kw_min_min       kw_max_min    
##  Min.   :0.0000       Min.   :0.0000        Min.   : -1.00   Min.   :     0  
##  1st Qu.:0.0000       1st Qu.:0.0000        1st Qu.: -1.00   1st Qu.:   445  
##  Median :0.0000       Median :0.0000        Median : -1.00   Median :   660  
##  Mean   :0.1853       Mean   :0.2126        Mean   : 26.11   Mean   :  1154  
##  3rd Qu.:0.0000       3rd Qu.:0.0000        3rd Qu.:  4.00   3rd Qu.:  1000  
##  Max.   :1.0000       Max.   :1.0000        Max.   :377.00   Max.   :298400  
##    kw_avg_min        kw_min_max       kw_max_max       kw_avg_max    
##  Min.   :   -1.0   Min.   :     0   Min.   :     0   Min.   :     0  
##  1st Qu.:  141.8   1st Qu.:     0   1st Qu.:843300   1st Qu.:172847  
##  Median :  235.5   Median :  1400   Median :843300   Median :244572  
##  Mean   :  312.4   Mean   : 13612   Mean   :752324   Mean   :259282  
##  3rd Qu.:  357.0   3rd Qu.:  7900   3rd Qu.:843300   3rd Qu.:330980  
##  Max.   :42827.9   Max.   :843300   Max.   :843300   Max.   :843300  
##    kw_min_avg     kw_max_avg       kw_avg_avg    self_reference_min_shares
##  Min.   :  -1   Min.   :     0   Min.   :    0   Min.   :     0           
##  1st Qu.:   0   1st Qu.:  3562   1st Qu.: 2382   1st Qu.:   639           
##  Median :1024   Median :  4356   Median : 2870   Median :  1200           
##  Mean   :1117   Mean   :  5657   Mean   : 3136   Mean   :  3999           
##  3rd Qu.:2057   3rd Qu.:  6020   3rd Qu.: 3600   3rd Qu.:  2600           
##  Max.   :3613   Max.   :298400   Max.   :43568   Max.   :843300           
##  self_reference_max_shares self_reference_avg_sharess weekday_is_monday
##  Min.   :     0            Min.   :     0.0           Min.   :0.000    
##  1st Qu.:  1100            1st Qu.:   981.2           1st Qu.:0.000    
##  Median :  2800            Median :  2200.0           Median :0.000    
##  Mean   : 10329            Mean   :  6401.7           Mean   :0.168    
##  3rd Qu.:  8000            3rd Qu.:  5200.0           3rd Qu.:0.000    
##  Max.   :843300            Max.   :843300.0           Max.   :1.000    
##  weekday_is_tuesday weekday_is_wednesday weekday_is_thursday weekday_is_friday
##  Min.   :0.0000     Min.   :0.0000       Min.   :0.0000      Min.   :0.0000   
##  1st Qu.:0.0000     1st Qu.:0.0000       1st Qu.:0.0000      1st Qu.:0.0000   
##  Median :0.0000     Median :0.0000       Median :0.0000      Median :0.0000   
##  Mean   :0.1864     Mean   :0.1875       Mean   :0.1833      Mean   :0.1438   
##  3rd Qu.:0.0000     3rd Qu.:0.0000       3rd Qu.:0.0000      3rd Qu.:0.0000   
##  Max.   :1.0000     Max.   :1.0000       Max.   :1.0000      Max.   :1.0000   
##  weekday_is_saturday weekday_is_sunday   is_weekend         LDA_00       
##  Min.   :0.00000     Min.   :0.00000   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:0.00000     1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:0.02505  
##  Median :0.00000     Median :0.00000   Median :0.0000   Median :0.03339  
##  Mean   :0.06188     Mean   :0.06904   Mean   :0.1309   Mean   :0.18460  
##  3rd Qu.:0.00000     3rd Qu.:0.00000   3rd Qu.:0.0000   3rd Qu.:0.24096  
##  Max.   :1.00000     Max.   :1.00000   Max.   :1.0000   Max.   :0.92699  
##      LDA_01            LDA_02            LDA_03            LDA_04       
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.02501   1st Qu.:0.02857   1st Qu.:0.02857   1st Qu.:0.02857  
##  Median :0.03334   Median :0.04000   Median :0.04000   Median :0.04073  
##  Mean   :0.14126   Mean   :0.21632   Mean   :0.22377   Mean   :0.23403  
##  3rd Qu.:0.15083   3rd Qu.:0.33422   3rd Qu.:0.37576   3rd Qu.:0.39999  
##  Max.   :0.92595   Max.   :0.92000   Max.   :0.92653   Max.   :0.92719  
##  global_subjectivity global_sentiment_polarity global_rate_positive_words
##  Min.   :0.0000      Min.   :-0.39375          Min.   :0.00000           
##  1st Qu.:0.3962      1st Qu.: 0.05776          1st Qu.:0.02838           
##  Median :0.4535      Median : 0.11912          Median :0.03902           
##  Mean   :0.4434      Mean   : 0.11931          Mean   :0.03962           
##  3rd Qu.:0.5083      3rd Qu.: 0.17783          3rd Qu.:0.05028           
##  Max.   :1.0000      Max.   : 0.72784          Max.   :0.15549           
##  global_rate_negative_words rate_positive_words rate_negative_words
##  Min.   :0.000000           Min.   :0.0000      Min.   :0.0000     
##  1st Qu.:0.009615           1st Qu.:0.6000      1st Qu.:0.1852     
##  Median :0.015337           Median :0.7105      Median :0.2800     
##  Mean   :0.016612           Mean   :0.6822      Mean   :0.2879     
##  3rd Qu.:0.021739           3rd Qu.:0.8000      3rd Qu.:0.3846     
##  Max.   :0.184932           Max.   :1.0000      Max.   :1.0000     
##  avg_positive_polarity min_positive_polarity max_positive_polarity
##  Min.   :0.0000        Min.   :0.00000       Min.   :0.0000       
##  1st Qu.:0.3062        1st Qu.:0.05000       1st Qu.:0.6000       
##  Median :0.3588        Median :0.10000       Median :0.8000       
##  Mean   :0.3538        Mean   :0.09545       Mean   :0.7567       
##  3rd Qu.:0.4114        3rd Qu.:0.10000       3rd Qu.:1.0000       
##  Max.   :1.0000        Max.   :1.00000       Max.   :1.0000       
##  avg_negative_polarity min_negative_polarity max_negative_polarity
##  Min.   :-1.0000       Min.   :-1.0000       Min.   :-1.0000      
##  1st Qu.:-0.3284       1st Qu.:-0.7000       1st Qu.:-0.1250      
##  Median :-0.2533       Median :-0.5000       Median :-0.1000      
##  Mean   :-0.2595       Mean   :-0.5219       Mean   :-0.1075      
##  3rd Qu.:-0.1869       3rd Qu.:-0.3000       3rd Qu.:-0.0500      
##  Max.   : 0.0000       Max.   : 0.0000       Max.   : 0.0000      
##  title_subjectivity title_sentiment_polarity abs_title_subjectivity
##  Min.   :0.0000     Min.   :-1.00000         Min.   :0.0000        
##  1st Qu.:0.0000     1st Qu.: 0.00000         1st Qu.:0.1667        
##  Median :0.1500     Median : 0.00000         Median :0.5000        
##  Mean   :0.2824     Mean   : 0.07143         Mean   :0.3418        
##  3rd Qu.:0.5000     3rd Qu.: 0.15000         3rd Qu.:0.5000        
##  Max.   :1.0000     Max.   : 1.00000         Max.   :0.5000        
##  abs_title_sentiment_polarity     shares      
##  Min.   :0.0000               Min.   :     1  
##  1st Qu.:0.0000               1st Qu.:   946  
##  Median :0.0000               Median :  1400  
##  Mean   :0.1561               Mean   :  3395  
##  3rd Qu.:0.2500               3rd Qu.:  2800  
##  Max.   :1.0000               Max.   :843300

#missing values
anyNA(df)

## [1] FALSE

#checking for outliers
df=df[!df$n_non_stop_words==1042,]

summary(df$n_non_stop_words) #fixed

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  1.0000  1.0000  0.9702  1.0000  1.0000

#dropiing the url variable & timedelta
df1 <- df[, -1]
df1 <- df1[,-1]



# Any errors

#Checking importance of news subjects(categorical) on shares
for (i in 13:18){
  
  boxplot(log(df1$shares) ~ (df1[,i]), xlab=names(df1)[i] , ylab="shares")
}

#Checking importance of weekdays on news shares
for (i in 31:37){
  
  boxplot(log(df1$shares) ~ (df1[,i]), xlab=names(df1)[i] , ylab="shares")
}

#Converting categorical values from numeric to factor - Weekdays
#for (i in 31:37){
 # df1[,i] <- factor(df1[,i])
  
#}

Summary statistics

 library(psych)
summary(df1)

##  n_tokens_title n_tokens_content n_unique_tokens  n_non_stop_words
##  Min.   : 2.0   Min.   :   0.0   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.: 9.0   1st Qu.: 246.0   1st Qu.:0.4709   1st Qu.:1.0000  
##  Median :10.0   Median : 409.0   Median :0.5392   Median :1.0000  
##  Mean   :10.4   Mean   : 546.5   Mean   :0.5305   Mean   :0.9702  
##  3rd Qu.:12.0   3rd Qu.: 716.0   3rd Qu.:0.6087   3rd Qu.:1.0000  
##  Max.   :23.0   Max.   :8474.0   Max.   :1.0000   Max.   :1.0000  
##  n_non_stop_unique_tokens   num_hrefs      num_self_hrefs       num_imgs      
##  Min.   :0.0000           Min.   :  0.00   Min.   :  0.000   Min.   :  0.000  
##  1st Qu.:0.6257           1st Qu.:  4.00   1st Qu.:  1.000   1st Qu.:  1.000  
##  Median :0.6905           Median :  8.00   Median :  3.000   Median :  1.000  
##  Mean   :0.6728           Mean   : 10.88   Mean   :  3.293   Mean   :  4.543  
##  3rd Qu.:0.7546           3rd Qu.: 14.00   3rd Qu.:  4.000   3rd Qu.:  4.000  
##  Max.   :1.0000           Max.   :304.00   Max.   :116.000   Max.   :128.000  
##    num_videos    average_token_length  num_keywords   
##  Min.   : 0.00   Min.   :0.000        Min.   : 1.000  
##  1st Qu.: 0.00   1st Qu.:4.478        1st Qu.: 6.000  
##  Median : 0.00   Median :4.664        Median : 7.000  
##  Mean   : 1.25   Mean   :4.548        Mean   : 7.224  
##  3rd Qu.: 1.00   3rd Qu.:4.855        3rd Qu.: 9.000  
##  Max.   :91.00   Max.   :8.042        Max.   :10.000  
##  data_channel_is_lifestyle data_channel_is_entertainment data_channel_is_bus
##  Min.   :0.00000           Min.   :0.000                 Min.   :0.0000     
##  1st Qu.:0.00000           1st Qu.:0.000                 1st Qu.:0.0000     
##  Median :0.00000           Median :0.000                 Median :0.0000     
##  Mean   :0.05295           Mean   :0.178                 Mean   :0.1579     
##  3rd Qu.:0.00000           3rd Qu.:0.000                 3rd Qu.:0.0000     
##  Max.   :1.00000           Max.   :1.000                 Max.   :1.0000     
##  data_channel_is_socmed data_channel_is_tech data_channel_is_world
##  Min.   :0.0000         Min.   :0.0000       Min.   :0.0000       
##  1st Qu.:0.0000         1st Qu.:0.0000       1st Qu.:0.0000       
##  Median :0.0000         Median :0.0000       Median :0.0000       
##  Mean   :0.0586         Mean   :0.1853       Mean   :0.2126       
##  3rd Qu.:0.0000         3rd Qu.:0.0000       3rd Qu.:0.0000       
##  Max.   :1.0000         Max.   :1.0000       Max.   :1.0000       
##    kw_min_min       kw_max_min       kw_avg_min        kw_min_max    
##  Min.   : -1.00   Min.   :     0   Min.   :   -1.0   Min.   :     0  
##  1st Qu.: -1.00   1st Qu.:   445   1st Qu.:  141.8   1st Qu.:     0  
##  Median : -1.00   Median :   660   Median :  235.5   Median :  1400  
##  Mean   : 26.11   Mean   :  1154   Mean   :  312.4   Mean   : 13612  
##  3rd Qu.:  4.00   3rd Qu.:  1000   3rd Qu.:  357.0   3rd Qu.:  7900  
##  Max.   :377.00   Max.   :298400   Max.   :42827.9   Max.   :843300  
##    kw_max_max       kw_avg_max       kw_min_avg     kw_max_avg    
##  Min.   :     0   Min.   :     0   Min.   :  -1   Min.   :     0  
##  1st Qu.:843300   1st Qu.:172844   1st Qu.:   0   1st Qu.:  3562  
##  Median :843300   Median :244567   Median :1024   Median :  4356  
##  Mean   :752322   Mean   :259280   Mean   :1117   Mean   :  5657  
##  3rd Qu.:843300   3rd Qu.:330980   3rd Qu.:2057   3rd Qu.:  6020  
##  Max.   :843300   Max.   :843300   Max.   :3613   Max.   :298400  
##    kw_avg_avg    self_reference_min_shares self_reference_max_shares
##  Min.   :    0   Min.   :     0            Min.   :     0           
##  1st Qu.: 2382   1st Qu.:   639            1st Qu.:  1100           
##  Median : 2870   Median :  1200            Median :  2800           
##  Mean   : 3136   Mean   :  3999            Mean   : 10330           
##  3rd Qu.: 3600   3rd Qu.:  2600            3rd Qu.:  8000           
##  Max.   :43568   Max.   :843300            Max.   :843300           
##  self_reference_avg_sharess weekday_is_monday weekday_is_tuesday
##  Min.   :     0.0           Min.   :0.000     Min.   :0.0000    
##  1st Qu.:   981.1           1st Qu.:0.000     1st Qu.:0.0000    
##  Median :  2200.0           Median :0.000     Median :0.0000    
##  Mean   :  6401.7           Mean   :0.168     Mean   :0.1864    
##  3rd Qu.:  5200.0           3rd Qu.:0.000     3rd Qu.:0.0000    
##  Max.   :843300.0           Max.   :1.000     Max.   :1.0000    
##  weekday_is_wednesday weekday_is_thursday weekday_is_friday weekday_is_saturday
##  Min.   :0.0000       Min.   :0.0000      Min.   :0.0000    Min.   :0.00000    
##  1st Qu.:0.0000       1st Qu.:0.0000      1st Qu.:0.0000    1st Qu.:0.00000    
##  Median :0.0000       Median :0.0000      Median :0.0000    Median :0.00000    
##  Mean   :0.1875       Mean   :0.1833      Mean   :0.1438    Mean   :0.06188    
##  3rd Qu.:0.0000       3rd Qu.:0.0000      3rd Qu.:0.0000    3rd Qu.:0.00000    
##  Max.   :1.0000       Max.   :1.0000      Max.   :1.0000    Max.   :1.00000    
##  weekday_is_sunday   is_weekend         LDA_00            LDA_01       
##  Min.   :0.00000   Min.   :0.0000   Min.   :0.01818   Min.   :0.01818  
##  1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:0.02505   1st Qu.:0.02501  
##  Median :0.00000   Median :0.0000   Median :0.03339   Median :0.03334  
##  Mean   :0.06904   Mean   :0.1309   Mean   :0.18460   Mean   :0.14126  
##  3rd Qu.:0.00000   3rd Qu.:0.0000   3rd Qu.:0.24097   3rd Qu.:0.15084  
##  Max.   :1.00000   Max.   :1.0000   Max.   :0.92699   Max.   :0.92595  
##      LDA_02            LDA_03            LDA_04        global_subjectivity
##  Min.   :0.01818   Min.   :0.01818   Min.   :0.01818   Min.   :0.0000     
##  1st Qu.:0.02857   1st Qu.:0.02857   1st Qu.:0.02857   1st Qu.:0.3962     
##  Median :0.04000   Median :0.04000   Median :0.04073   Median :0.4535     
##  Mean   :0.21633   Mean   :0.22378   Mean   :0.23404   Mean   :0.4434     
##  3rd Qu.:0.33422   3rd Qu.:0.37578   3rd Qu.:0.39999   3rd Qu.:0.5083     
##  Max.   :0.92000   Max.   :0.92653   Max.   :0.92719   Max.   :1.0000     
##  global_sentiment_polarity global_rate_positive_words
##  Min.   :-0.39375          Min.   :0.00000           
##  1st Qu.: 0.05776          1st Qu.:0.02839           
##  Median : 0.11912          Median :0.03902           
##  Mean   : 0.11931          Mean   :0.03963           
##  3rd Qu.: 0.17784          3rd Qu.:0.05028           
##  Max.   : 0.72784          Max.   :0.15549           
##  global_rate_negative_words rate_positive_words rate_negative_words
##  Min.   :0.000000           Min.   :0.0000      Min.   :0.0000     
##  1st Qu.:0.009615           1st Qu.:0.6000      1st Qu.:0.1852     
##  Median :0.015337           Median :0.7105      Median :0.2800     
##  Mean   :0.016613           Mean   :0.6822      Mean   :0.2879     
##  3rd Qu.:0.021739           3rd Qu.:0.8000      3rd Qu.:0.3846     
##  Max.   :0.184932           Max.   :1.0000      Max.   :1.0000     
##  avg_positive_polarity min_positive_polarity max_positive_polarity
##  Min.   :0.0000        Min.   :0.00000       Min.   :0.0000       
##  1st Qu.:0.3062        1st Qu.:0.05000       1st Qu.:0.6000       
##  Median :0.3588        Median :0.10000       Median :0.8000       
##  Mean   :0.3538        Mean   :0.09545       Mean   :0.7567       
##  3rd Qu.:0.4114        3rd Qu.:0.10000       3rd Qu.:1.0000       
##  Max.   :1.0000        Max.   :1.00000       Max.   :1.0000       
##  avg_negative_polarity min_negative_polarity max_negative_polarity
##  Min.   :-1.0000       Min.   :-1.000        Min.   :-1.0000      
##  1st Qu.:-0.3284       1st Qu.:-0.700        1st Qu.:-0.1250      
##  Median :-0.2533       Median :-0.500        Median :-0.1000      
##  Mean   :-0.2595       Mean   :-0.522        Mean   :-0.1075      
##  3rd Qu.:-0.1869       3rd Qu.:-0.300        3rd Qu.:-0.0500      
##  Max.   : 0.0000       Max.   : 0.000        Max.   : 0.0000      
##  title_subjectivity title_sentiment_polarity abs_title_subjectivity
##  Min.   :0.0000     Min.   :-1.00000         Min.   :0.0000        
##  1st Qu.:0.0000     1st Qu.: 0.00000         1st Qu.:0.1667        
##  Median :0.1500     Median : 0.00000         Median :0.5000        
##  Mean   :0.2824     Mean   : 0.07143         Mean   :0.3419        
##  3rd Qu.:0.5000     3rd Qu.: 0.15000         3rd Qu.:0.5000        
##  Max.   :1.0000     Max.   : 1.00000         Max.   :0.5000        
##  abs_title_sentiment_polarity     shares      
##  Min.   :0.0000               Min.   :     1  
##  1st Qu.:0.0000               1st Qu.:   946  
##  Median :0.0000               Median :  1400  
##  Mean   :0.1561               Mean   :  3395  
##  3rd Qu.:0.2500               3rd Qu.:  2800  
##  Max.   :1.0000               Max.   :843300

#lookinf more into our target variable
summary(df1$shares)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       1     946    1400    3395    2800  843300

describe(df1$shares)

##    vars     n    mean       sd median trimmed    mad min    max  range  skew
## X1    1 39643 3395.32 11627.09   1400 1888.71 889.56   1 843300 843299 33.96
##    kurtosis   se
## X1  1832.31 58.4

Popularity categories distribution table

library(ggplot2)
# Visualize the feature of different day of week
columns_day <- names(df1)[30:36]
unpop <- df1[df1$shares <1400, ]
pop <- df1[df1$shares>=1400, ]
unpop_day <- colSums(unpop[, columns_day])
pop_day <- colSums(pop[, columns_day])

df_freq <- data.frame(Class = c("Popular", "Non-popular"), Frequency = c(nrow(pop), nrow(unpop)))

ggplot(df_freq, aes(x = Class, y = Frequency, fill = Class)) +
  geom_bar(stat = "identity") +
  labs(title = "Frequency Distribution of Popular and Non-Popular Articles", x = "", y = "Frequency") +
  theme(plot.title = element_text(size = 16, face = "bold"),
        axis.text.x = element_text(size = 14),
        axis.text.y = element_text(size = 14),
        axis.title = element_text(size = 14, face = "bold"))

Weekday distribution table

# create a data frame with the day columns, popular counts, and unpopular counts
df_day_counts <- data.frame(
  Day = columns_day,
  Popular = pop_day,
  Unpopular = unpop_day
)

# convert the data frame to long format
df_day_counts_long <- tidyr::pivot_longer(df_day_counts, cols = c("Popular", "Unpopular"), names_to = "Category", values_to = "Count")

# create the grouped bar plot
ggplot(df_day_counts_long, aes(x = Day, y = Count, fill = Category)) +
  geom_bar(position = "dodge", stat = "identity") +
  labs(title = "Count of popular/unpopular news over different day of week", x = "Days of week", y = "Count") + 
  theme(plot.title = element_text(size = 16)) + 
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) + 
  theme(axis.title = element_text(size = 12)) + 
  scale_fill_manual(values = c("red", "blue"), labels = c("popular", "unpopular")) + 
  guides(fill = guide_legend(title = NULL))

  #theme(plot.margin = margin(1, 1, 1, 1, "cm"))

Data channels distribution table

library(ggplot2)
# Visualize the feature of different article category
columns_chan <- names(df1)[12:17]
unpop_chan <- colSums(unpop[, columns_chan])
pop_chan <- colSums(pop[, columns_chan])

# Create a data frame for the grouped bar plot
df_grouped <- data.frame(category = columns_chan,
                         popular = pop_chan,
                         unpopular = unpop_chan)



# convert the data frame to long format
df_channel <- tidyr::pivot_longer(df_grouped, cols = c("popular", "unpopular"), names_to = "Category", values_to = "Count")

# create the grouped bar plot
ggplot(df_channel, aes(x = category, y = Count, fill = Category)) +
  geom_bar(position = "dodge", stat = "identity") +
  labs(title = "Count of popular/unpopular news over different channels", x = "Different channels", y = "Count") + 
  theme(plot.title = element_text(size = 16)) + 
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) + 
  theme(axis.title = element_text(size = 12)) + 
  scale_fill_manual(values = c("green", "orange"), labels = c("popular", "unpopular")) + 
  guides(fill = guide_legend(title = NULL))

  #theme(plot.margin = margin(1, 1, 1, 1, "cm"))

#further prepocessing tasks

df1$shares <- log(df1$shares)
summary(df1$shares)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   6.852   7.244   7.475   7.937  13.645

Model development for regression linear regression random forests gradient boosting machines,

library(randomForest)
library(rpart)
library(ISLR)
library(dplyr)
library(xgboost)

n <- 10
B <- 20

lm_rmse <- c()
RF_RMSPE <-c()
RMSPE_B <-c()

for (i in 1:n) {
ind <- sample(nrow(df1), nrow(df1), replace = TRUE)
train <- df1[ind, ]
test <- df1[-ind, ]

yhat <- matrix(0, nrow(test), B)

#Random Forest  
rf_model <- randomForest(shares ~., ntree = B, data = train) 
yhat1 <- predict(rf_model, test)
RF_RMSPE[i] <- sqrt(mean((test$shares - yhat1)^2))

# localImp = TRUE
## comparing with parametric model
lm_model <- lm(shares ~ ., data=train) ##what about glm
lm_pred <- predict(lm_model, test)
lm_rmse[i] <- sqrt(mean((lm_pred - test$shares)^2))


#BAGGING
  for (j in 1:B) {
    indBB <- sample(nrow(train), nrow(train), replace = TRUE)
    trBB <- train[indBB, ]
    
    model <- rpart(shares ~ ., data = trBB, method = "anova")
    yhat[ ,j] <- predict(model, test)
  }
  yhatB <- apply(yhat, 1, mean) 
  RMSPE_B[i] <- sqrt(mean((test$shares - yhatB)^2))
}

#average of RF_RMSPE AND LM MODEL
mean(RF_RMSPE)

## [1] 0.8730865

mean(lm_rmse)

## [1] 0.8694631

mean(RMSPE_B)

## [1] 0.8922103

#do i sample or split the data

# create a vector of all RMSPE values
all_RMSPE <- c(RF_RMSPE, lm_rmse, RMSPE_B)

# create a vector of model names
model_names <- c(rep("RF", n), rep("LM", n), rep("BAGGING", n))

# create a data frame with the RMSPE values and model names
df <- data.frame(RMSPE = all_RMSPE, Model = model_names)

# plot the RMSPE values for each model
library(ggplot2)
ggplot(df, aes(x = Model, y = RMSPE, fill = Model)) +
  geom_boxplot() +
  scale_fill_manual(values = c("red", "blue", "green")) +
  labs(x = "Model", y = "RMSPE") +
  ggtitle("RMSPE for Different Models")

Model development for classification

library(randomForest)
library(PASWR)
library(ROCR)
library(rpart)

average <- mean(df1$shares)

nwshares <-ifelse(df1$shares < average, "unpopular", "popular")
newdata <- cbind(df1, nwshares)
newdata <- newdata[, -59]


newdata$nwshares <- as.factor(newdata$nwshares)



df <- newdata #for rpart
dff <- df[complete.cases(df),] # for rf

n <- 10
B <- 20
AUC1 <- c()
AUC2 <- c()
AUC3 <- c()

for (i in 1:n) {
  ind <- sample(nrow(df), nrow(df), replace = TRUE)
  train <- df[ind, ]
  test <- df[-ind, ]
  
  ind <- sample(nrow(dff), nrow(dff), replace = TRUE)
  train2 <- dff[ind, ]
  test2 <- dff[-ind, ]
  
  p = ncol(train2)-1

  model1 <- rpart(nwshares~.,
                data=train2, method="class") #cart
  model2 <- randomForest(nwshares~.,
                         ntree = B, mtry = p, data = train2) #Bagged
  model3 <- randomForest(nwshares~.,
                         ntree = B, data = train2) # RF    
  
  phat1 <- predict(model1, test2, type = "prob")
  phat2 <- predict(model2, test2, type = "prob")
  phat3 <- predict(model3, test2, type = "prob")
  
  #AUC1
  pred_rocr1 <- prediction(phat1[,2], test2$nwshares)
  auc_ROCR1 <- performance(pred_rocr1, measure = "auc")
  AUC1[i] <- auc_ROCR1@y.values[[1]]
  
  #AUC2
  pred_rocr2 <- prediction(phat2[,2], test2$nwshares)
  auc_ROCR2 <- performance(pred_rocr2, measure = "auc")
  AUC2[i] <- auc_ROCR2@y.values[[1]]
  
  #AUC3
  pred_rocr3 <- prediction(phat3[,2], test2$nwshares)
  auc_ROCR3 <- performance(pred_rocr3, measure = "auc")
  AUC3[i] <- auc_ROCR3@y.values[[1]]
}




mean(AUC1)

## [1] 0.6183209

mean(AUC2)

## [1] 0.6839123

mean(AUC3)

## [1] 0.6871194

sd(AUC1)

## [1] 0.01009076

sd(AUC2)

## [1] 0.00277586

sd(AUC3)

## [1] 0.003396944

# Create a data frame with the AUC values for each model
auc_data <- data.frame(
  Model = c("CART", "Bagged", "Random Forest"),
  AUC = c(mean(AUC1), mean(AUC2), mean(AUC3))
)

# Create a bar chart of the AUC values for each model
library(ggplot2)
ggplot(auc_data, aes(x = Model, y = AUC, fill = Model)) +
  geom_bar(stat = "identity", color = "black") +
  ggtitle("AUC for Three Models") +
  xlab("Model") +
  ylab("AUC") +
  theme(legend.position = "none")

# Calculate uncertainty values for each model
UN1 <- 1 - AUC1
UN2 <- 1 - AUC2
UN3 <- 1 - AUC3

mean(UN1)

## [1] 0.3816791

mean(UN2)

## [1] 0.3160877

mean(UN3)

## [1] 0.3128806

##JOUSBOOST USING 1,-1
thresh <- mean(df1$shares)
Y <-ifelse(df1$shares > thresh, 1, -1)
bstdata <- cbind(df1, Y)
bstdata <- bstdata[, -59]

#DF$Y <- ifelse(df$y == 0, -1, 1)

#adaboost X's as a matrix
library(mltools)
library(data.table)

df_hot <- one_hot(as.data.table(bstdata))

library(JOUSBoost)
MAUC <- c()
rnd <- seq(50, 200, 50)


for (r in 1:length(rnd)) {
  aucc <- c()
  for (i in 1:10) {
    ind <- sample(nrow(df_hot), nrow(df_hot), replace = TRUE)
    train <- df_hot[ind, ]
    val <- df_hot[-ind,]
    
    ada <- adaboost(as.matrix(train[, -"Y"]),
                    train$Y, tree_depth = 1,
                    n_rounds = 50)
    
    phat <- predict(ada, val, type = "prob")
    
    pred_rocR <- prediction(phat, val$Y)
    auc_ROCR <- performance(pred_rocR, measure = "auc")
    aucc[i] <- auc_ROCR@y.values[[1]]
  }
  MAUC[r] <- mean(aucc)
}

max(aucc)

## [1] 0.7096108

Model interpretation

library(randomForestExplainer)
#checking the importance and confusion matrix
#CLASSIFICATION
model1$importance

## NULL

#FOR BAGGED
varImpPlot(model2)

importance_frame <- measure_importance(model2)

## [1] "Warning: your forest does not contain information on local importance so 'accuracy_decrease' measure cannot be extracted. To add it regrow the forest with the option localImp = TRUE and run this function again."

importance_frame

##                         variable mean_min_depth no_of_nodes gini_decrease
## 1   abs_title_sentiment_polarity           6.30         730     175.88612
## 2         abs_title_subjectivity           6.30         823     197.88940
## 3           average_token_length           4.90        2408     599.24921
## 4          avg_negative_polarity           5.20        1939     481.62915
## 5          avg_positive_polarity           4.95        2116     533.80894
## 6            data_channel_is_bus          10.05          92      18.37003
## 7  data_channel_is_entertainment           1.55          81     179.14631
## 8      data_channel_is_lifestyle           8.40         169      37.09159
## 9         data_channel_is_socmed           2.55         136     157.36397
## 10          data_channel_is_tech           1.80          99     169.93335
## 11         data_channel_is_world           9.95         110      22.98580
## 12    global_rate_negative_words           4.70        1725     434.11959
## 13    global_rate_positive_words           5.40        2029     515.85172
## 14     global_sentiment_polarity           5.45        1848     459.57302
## 15           global_subjectivity           4.95        2181     554.49273
## 16                    is_weekend           2.75         212     155.88792
## 17                    kw_avg_avg           0.00        2425    1422.30383
## 18                    kw_avg_max           4.05        2423     637.22205
## 19                    kw_avg_min           4.50        2063     549.89822
## 20                    kw_max_avg           4.25        2448     656.16505
## 21                    kw_max_max           3.95         379     155.19740
## 22                    kw_max_min           4.60        1984     484.12168
## 23                    kw_min_avg           3.55        1359     386.54789
## 24                    kw_min_max           5.10        1179     298.92844
## 25                    kw_min_min           6.90         250      61.91794
## 26                        LDA_00           4.10        2211     621.27472
## 27                        LDA_01           4.55        2386     606.51774
## 28                        LDA_02           3.85        2163     564.98857
## 29                        LDA_03           4.85        2120     530.08050
## 30                        LDA_04           4.65        2280     583.52389
## 31         max_negative_polarity           5.95        1156     246.12376
## 32         max_positive_polarity           6.85         693     157.54915
## 33         min_negative_polarity           5.25         921     211.58774
## 34         min_positive_polarity           4.40         895     229.39192
## 35      n_non_stop_unique_tokens           4.10        1990     566.68550
## 36              n_non_stop_words           4.80        1416     355.87255
## 37              n_tokens_content           4.25        1393     375.26570
## 38                n_tokens_title           5.45        1605     348.24349
## 39               n_unique_tokens           3.45        1654     482.37754
## 40                     num_hrefs           5.10        1565     392.85623
## 41                      num_imgs           5.60        1077     257.32666
## 42                  num_keywords           6.90         553     113.24019
## 43                num_self_hrefs           6.05        1111     235.24689
## 44                    num_videos           6.35         770     175.77564
## 45           rate_negative_words           5.40        1002     224.74378
## 46           rate_positive_words           6.40        1013     220.57527
## 47    self_reference_avg_sharess           3.10        1472     428.84535
## 48     self_reference_max_shares           4.95        1303     336.29946
## 49     self_reference_min_shares           1.60        1491     612.37870
## 50      title_sentiment_polarity           5.50        1342     288.96890
## 51            title_subjectivity           5.75         873     214.23341
## 52             weekday_is_friday           8.55         263      48.54308
## 53             weekday_is_monday           8.70         255      54.23933
## 54           weekday_is_saturday           4.15         193      66.16877
## 55             weekday_is_sunday           8.45         149      29.11311
## 56           weekday_is_thursday           8.00         264      56.17353
## 57            weekday_is_tuesday           8.70         242      55.68694
## 58          weekday_is_wednesday           7.80         279      63.92562
##    no_of_trees times_a_root       p_value
## 1           20            0  1.000000e+00
## 2           20            0  1.000000e+00
## 3           20            0 3.512326e-213
## 4           20            0  1.283028e-88
## 5           20            0 2.980437e-130
## 6           20            0  1.000000e+00
## 7           20            0  1.000000e+00
## 8           20            0  1.000000e+00
## 9           20            0  1.000000e+00
## 10          20            0  1.000000e+00
## 11          20            0  1.000000e+00
## 12          20            0  5.767971e-48
## 13          20            0 6.311421e-109
## 14          20            0  5.614276e-70
## 15          20            0 3.118287e-147
## 16          20            0  1.000000e+00
## 17          20           20 1.615906e-218
## 18          20            0 6.904089e-218
## 19          20            0 4.574608e-117
## 20          20            0 7.986184e-226
## 21          20            0  1.000000e+00
## 22          20            0  1.519578e-98
## 23          20            0  1.466653e-06
## 24          20            0  6.832697e-01
## 25          20            0  1.000000e+00
## 26          20            0 2.306134e-155
## 27          20            0 2.357388e-206
## 28          20            0 1.923284e-142
## 29          20            0 2.849979e-131
## 30          20            0 9.410657e-175
## 31          20            0  8.755924e-01
## 32          20            0  1.000000e+00
## 33          20            0  1.000000e+00
## 34          20            0  1.000000e+00
## 35          20            0 6.661748e-100
## 36          20            0  1.935765e-10
## 37          20            0  9.400985e-09
## 38          20            0  3.104245e-30
## 39          20            0  5.182808e-37
## 40          20            0  3.439695e-25
## 41          20            0  9.997766e-01
## 42          20            0  1.000000e+00
## 43          20            0  9.936026e-01
## 44          20            0  1.000000e+00
## 45          20            0  1.000000e+00
## 46          20            0  1.000000e+00
## 47          20            0  3.231495e-15
## 48          20            0  9.741551e-04
## 49          20            0  4.741323e-17
## 50          20            0  1.341195e-05
## 51          20            0  1.000000e+00
## 52          20            0  1.000000e+00
## 53          20            0  1.000000e+00
## 54          20            0  1.000000e+00
## 55          20            0  1.000000e+00
## 56          20            0  1.000000e+00
## 57          20            0  1.000000e+00
## 58          20            0  1.000000e+00

#FOR RANDOM FOREST
varImpPlot(model3)

importance_frame <- measure_importance(model3)

## [1] "Warning: your forest does not contain information on local importance so 'accuracy_decrease' measure cannot be extracted. To add it regrow the forest with the option localImp = TRUE and run this function again."

importance_frame

##                         variable mean_min_depth no_of_nodes gini_decrease
## 1   abs_title_sentiment_polarity           5.60        1347     223.45907
## 2         abs_title_subjectivity           5.75        1375     241.39813
## 3           average_token_length           4.50        2617     472.51235
## 4          avg_negative_polarity           4.90        2446     448.57534
## 5          avg_positive_polarity           5.50        2615     481.19434
## 6            data_channel_is_bus           7.50         244      49.15486
## 7  data_channel_is_entertainment           4.05         211     139.96816
## 8      data_channel_is_lifestyle           9.35         179      27.21358
## 9         data_channel_is_socmed           4.95         202      89.20008
## 10          data_channel_is_tech           5.50         299      89.95859
## 11         data_channel_is_world           6.10         201      83.17301
## 12    global_rate_negative_words           5.05        2371     415.06913
## 13    global_rate_positive_words           5.00        2515     448.63125
## 14     global_sentiment_polarity           4.70        2493     454.66263
## 15           global_subjectivity           5.05        2573     489.62904
## 16                    is_weekend           3.10         297     137.08441
## 17                    kw_avg_avg           2.45        2994     891.81498
## 18                    kw_avg_max           4.20        2929     563.06035
## 19                    kw_avg_min           4.55        2780     522.48042
## 20                    kw_max_avg           2.55        2942     768.01355
## 21                    kw_max_max           4.90         663     143.92281
## 22                    kw_max_min           4.05        2740     526.48079
## 23                    kw_min_avg           3.95        1914     431.95954
## 24                    kw_min_max           4.75        1711     322.47131
## 25                    kw_min_min           6.70         598      99.83154
## 26                        LDA_00           3.90        2845     553.17881
## 27                        LDA_01           4.25        2716     534.02020
## 28                        LDA_02           3.15        2821     647.48997
## 29                        LDA_03           5.20        2682     492.31013
## 30                        LDA_04           4.25        2777     526.80679
## 31         max_negative_polarity           6.25        1631     264.16100
## 32         max_positive_polarity           6.60        1143     176.72285
## 33         min_negative_polarity           5.45        1614     263.38545
## 34         min_positive_polarity           4.65        1320     247.13563
## 35      n_non_stop_unique_tokens           4.15        2692     507.40346
## 36              n_non_stop_words           4.25        2458     464.83665
## 37              n_tokens_content           3.95        2477     475.45621
## 38                n_tokens_title           5.75        1871     301.50136
## 39               n_unique_tokens           3.90        2528     486.38239
## 40                     num_hrefs           4.35        2131     399.37030
## 41                      num_imgs           4.60        1444     270.65830
## 42                  num_keywords           5.95        1243     187.66695
## 43                num_self_hrefs           5.30        1463     250.27919
## 44                    num_videos           6.20         951     161.89995
## 45           rate_negative_words           4.75        2193     381.72670
## 46           rate_positive_words           4.85        2134     379.24549
## 47    self_reference_avg_sharess           3.00        2138     550.15048
## 48     self_reference_max_shares           4.30        2129     417.69069
## 49     self_reference_min_shares           1.80        2140     642.07392
## 50      title_sentiment_polarity           5.75        1740     296.83422
## 51            title_subjectivity           5.80        1410     238.44825
## 52             weekday_is_friday           8.35         451      59.87276
## 53             weekday_is_monday           8.70         435      56.07680
## 54           weekday_is_saturday           5.30         217      72.27175
## 55             weekday_is_sunday           6.95         226      43.68045
## 56           weekday_is_thursday           8.00         413      59.85872
## 57            weekday_is_tuesday           7.40         460      63.60599
## 58          weekday_is_wednesday           7.95         400      60.95398
##    no_of_trees times_a_root       p_value
## 1           20            0  1.000000e+00
## 2           20            0  1.000000e+00
## 3           20            0 4.498759e-105
## 4           20            0  3.334453e-73
## 5           20            0 1.135587e-104
## 6           20            0  1.000000e+00
## 7           20            0  1.000000e+00
## 8           20            0  1.000000e+00
## 9           20            1  1.000000e+00
## 10          20            0  1.000000e+00
## 11          20            1  1.000000e+00
## 12          20            0  7.026173e-61
## 13          20            0  1.917326e-85
## 14          20            0  1.896071e-81
## 15          20            1  2.196910e-96
## 16          20            2  1.000000e+00
## 17          20            4 1.656148e-192
## 18          20            0 7.696873e-176
## 19          20            0 4.320664e-140
## 20          20            2 4.018252e-179
## 21          20            0  1.000000e+00
## 22          20            0 4.297922e-131
## 23          20            1  9.005029e-10
## 24          20            0  1.286509e-01
## 25          20            0  1.000000e+00
## 26          20            0 2.937625e-155
## 27          20            0 8.074083e-126
## 28          20            2 1.400731e-149
## 29          20            0 1.645481e-118
## 30          20            0 2.086089e-139
## 31          20            0  8.003544e-01
## 32          20            0  1.000000e+00
## 33          20            0  8.973814e-01
## 34          20            1  1.000000e+00
## 35          20            0 1.220013e-120
## 36          20            0  2.855623e-75
## 37          20            0  1.346736e-78
## 38          20            0  2.933635e-07
## 39          20            0  7.614922e-88
## 40          20            0  1.126198e-28
## 41          20            0  1.000000e+00
## 42          20            0  1.000000e+00
## 43          20            0  9.999998e-01
## 44          20            0  1.000000e+00
## 45          20            0  6.678142e-36
## 46          20            0  5.248490e-29
## 47          20            1  1.883822e-29
## 48          20            0  1.869114e-28
## 49          20            4  1.125400e-29
## 50          20            0  3.277781e-02
## 51          20            0  1.000000e+00
## 52          20            0  1.000000e+00
## 53          20            0  1.000000e+00
## 54          20            0  1.000000e+00
## 55          20            0  1.000000e+00
## 56          20            0  1.000000e+00
## 57          20            0  1.000000e+00
## 58          20            0  1.000000e+00

#FOR REGRESSION
varImpPlot(rf_model)

importance_frame <- measure_importance(rf_model)

## [1] "Warning: your forest does not contain information on local importance so 'mse_increase' measure cannot be extracted. To add it regrow the forest with the option localImp = TRUE and run this function again."

importance_frame

##                         variable mean_min_depth no_of_nodes
## 1   abs_title_sentiment_polarity           6.05        3620
## 2         abs_title_subjectivity           6.75        3634
## 3           average_token_length           4.90        7158
## 4          avg_negative_polarity           5.00        6470
## 5          avg_positive_polarity           4.65        7071
## 6            data_channel_is_bus           9.75         576
## 7  data_channel_is_entertainment           3.75         598
## 8      data_channel_is_lifestyle           8.70         394
## 9         data_channel_is_socmed           4.65         358
## 10          data_channel_is_tech           4.15         553
## 11         data_channel_is_world           7.95         572
## 12    global_rate_negative_words           5.25        6308
## 13    global_rate_positive_words           4.40        6802
## 14     global_sentiment_polarity           4.55        6518
## 15           global_subjectivity           4.75        7115
## 16                    is_weekend           2.85         627
## 17                    kw_avg_avg           1.10        7724
## 18                    kw_avg_max           4.20        7536
## 19                    kw_avg_min           4.15        7274
## 20                    kw_max_avg           2.15        7668
## 21                    kw_max_max           4.85        1481
## 22                    kw_max_min           4.55        6995
## 23                    kw_min_avg           3.45        4675
## 24                    kw_min_max           5.40        4350
## 25                    kw_min_min           5.50        1566
## 26                        LDA_00           4.85        7170
## 27                        LDA_01           4.65        7150
## 28                        LDA_02           3.60        6990
## 29                        LDA_03           4.50        7120
## 30                        LDA_04           3.80        7299
## 31         max_negative_polarity           6.10        4358
## 32         max_positive_polarity           6.45        3445
## 33         min_negative_polarity           6.15        4467
## 34         min_positive_polarity           5.50        3563
## 35      n_non_stop_unique_tokens           4.10        6839
## 36              n_non_stop_words           5.10        5939
## 37              n_tokens_content           4.55        6054
## 38                n_tokens_title           5.85        5192
## 39               n_unique_tokens           4.65        6498
## 40                     num_hrefs           4.45        5820
## 41                      num_imgs           4.70        3575
## 42                  num_keywords           6.10        3421
## 43                num_self_hrefs           6.05        4132
## 44                    num_videos           6.00        2363
## 45           rate_negative_words           5.30        5439
## 46           rate_positive_words           5.95        5502
## 47    self_reference_avg_sharess           2.05        5473
## 48     self_reference_max_shares           3.80        5391
## 49     self_reference_min_shares           2.50        5791
## 50      title_sentiment_polarity           6.00        3962
## 51            title_subjectivity           6.30        3942
## 52             weekday_is_friday           8.55        1032
## 53             weekday_is_monday           7.90        1091
## 54           weekday_is_saturday           6.65         371
## 55             weekday_is_sunday           6.20         424
## 56           weekday_is_thursday           8.55        1163
## 57            weekday_is_tuesday           8.40        1181
## 58          weekday_is_wednesday           7.80        1236
##    node_purity_increase no_of_trees times_a_root       p_value
## 1             361.96855          20            0  1.000000e+00
## 2             329.99511          20            0  1.000000e+00
## 3             942.55218          20            0  0.000000e+00
## 4             761.65110          20            0 4.518616e-206
## 5             865.78038          20            0  0.000000e+00
## 6              37.55146          20            0  1.000000e+00
## 7             227.15143          20            0  1.000000e+00
## 8              55.39074          20            0  1.000000e+00
## 9             146.19255          20            0  1.000000e+00
## 10            154.37855          20            0  1.000000e+00
## 11             77.88976          20            0  1.000000e+00
## 12            670.52556          20            0 4.676845e-178
## 13            829.69178          20            0 4.415091e-269
## 14            837.36505          20            0 1.012688e-214
## 15            949.03939          20            0  0.000000e+00
## 16            290.08876          20            0  1.000000e+00
## 17           2477.40845          20            9  0.000000e+00
## 18           1007.33576          20            0  0.000000e+00
## 19            940.40374          20            0  0.000000e+00
## 20           1639.72546          20            4  0.000000e+00
## 21            225.31631          20            0  1.000000e+00
## 22            827.72836          20            0 4.691301e-309
## 23            797.29637          20            1  7.824304e-08
## 24            495.96671          20            0  3.711963e-01
## 25            181.07964          20            0  1.000000e+00
## 26            942.14681          20            0  0.000000e+00
## 27            908.54537          20            0  0.000000e+00
## 28            933.08244          20            0 5.459439e-308
## 29            997.42790          20            2  0.000000e+00
## 30           1003.62482          20            0  0.000000e+00
## 31            411.41419          20            0  3.259535e-01
## 32            292.27921          20            0  1.000000e+00
## 33            364.56348          20            0  1.734615e-02
## 34            363.67230          20            0  1.000000e+00
## 35            967.56319          20            0 1.474929e-276
## 36            674.19418          20            0 3.142811e-121
## 37            696.98487          20            0 7.092232e-138
## 38            575.80919          20            0  4.935300e-38
## 39            846.62502          20            0 4.252466e-211
## 40            754.50557          20            0 4.808385e-105
## 41            500.14904          20            0  1.000000e+00
## 42            270.21904          20            0  1.000000e+00
## 43            358.97631          20            0  9.988032e-01
## 44            303.51098          20            0  1.000000e+00
## 45            536.07648          20            0  1.359999e-60
## 46            529.55093          20            0  3.777166e-67
## 47           1080.02788          20            3  4.315452e-64
## 48            718.99262          20            0  8.175748e-56
## 49           1089.12378          20            1 2.906891e-101
## 50            480.69846          20            0  1.000000e+00
## 51            397.31150          20            0  1.000000e+00
## 52             85.78724          20            0  1.000000e+00
## 53            105.96413          20            0  1.000000e+00
## 54             77.74983          20            0  1.000000e+00
## 55             75.00159          20            0  1.000000e+00
## 56             87.59704          20            0  1.000000e+00
## 57             79.46224          20            0  1.000000e+00
## 58             97.43598          20            0  1.000000e+00

#CONFUSION TABLE
yhat <- ifelse(phat3 > 0.5, 1, 0)
## 5. Build the confusion table.
length(test2$nwshares)

## [1] 14671

ct <- table(test2$nwshares, yhat[,2])

# This function take `ct` and make a better table
gct <- function(x) {
  conf_table <- matrix(0, 2, 2)
  conf_table[1, 1] <- ifelse(sum(dim(x)) > 3, x[2, 2], 0)
  conf_table[2, 2] <- ifelse(sum(dim(x)) > 3, x[1, 1], 0)
  conf_table[1, 2] <- ifelse(sum(dim(x)) > 3, x[2, 1], 0)
  conf_table[2, 1] <- ifelse(sum(dim(x)) > 3, x[1, 2], 0)
  colnames(conf_table) <- c("Y=1", "Y=0")
  rownames(conf_table) <- c("phat=1", "phat=0")
  conf_table
}

cont <- gct(ct)
cont

##         Y=1  Y=0
## phat=1 6430 2295
## phat=0 2857 3089

accuracy <- sum(diag(cont)) / sum(cont)
accuracy

## [1] 0.648831

Predicting Article Popularity in Online News Media Using Machine Learning

Sourabh Joshi

2023-04-19