The project will involve the following steps: 1. Data preprocessing, 2. Model development for regression, 3. Model development for classification, 4. Model interpretation, 5. Model validation.
library(caret)
library(dplyr)
library(pROC)
#loading the data
library(readr)
# Read the Online News Popularity CSV into a readr tibble
data <- read_csv(
  file = "~/Downloads/OnlineNewsPopularity/OnlineNewsPopularity.csv"
)
# Print the column names and the types readr inferred
str(object = data)
## spc_tbl_ [39,644 × 61] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ url : chr [1:39644] "http://mashable.com/2013/01/07/amazon-instant-video-browser/" "http://mashable.com/2013/01/07/ap-samsung-sponsored-tweets/" "http://mashable.com/2013/01/07/apple-40-billion-app-downloads/" "http://mashable.com/2013/01/07/astronaut-notre-dame-bcs/" ...
## $ timedelta : num [1:39644] 731 731 731 731 731 731 731 731 731 731 ...
## $ n_tokens_title : num [1:39644] 12 9 9 9 13 10 8 12 11 10 ...
## $ n_tokens_content : num [1:39644] 219 255 211 531 1072 ...
## $ n_unique_tokens : num [1:39644] 0.664 0.605 0.575 0.504 0.416 ...
## $ n_non_stop_words : num [1:39644] 1 1 1 1 1 ...
## $ n_non_stop_unique_tokens : num [1:39644] 0.815 0.792 0.664 0.666 0.541 ...
## $ num_hrefs : num [1:39644] 4 3 3 9 19 2 21 20 2 4 ...
## $ num_self_hrefs : num [1:39644] 2 1 1 0 19 2 20 20 0 1 ...
## $ num_imgs : num [1:39644] 1 1 1 1 20 0 20 20 0 1 ...
## $ num_videos : num [1:39644] 0 0 0 0 0 0 0 0 0 1 ...
## $ average_token_length : num [1:39644] 4.68 4.91 4.39 4.4 4.68 ...
## $ num_keywords : num [1:39644] 5 4 6 7 7 9 10 9 7 5 ...
## $ data_channel_is_lifestyle : num [1:39644] 0 0 0 0 0 0 1 0 0 0 ...
## $ data_channel_is_entertainment: num [1:39644] 1 0 0 1 0 0 0 0 0 0 ...
## $ data_channel_is_bus : num [1:39644] 0 1 1 0 0 0 0 0 0 0 ...
## $ data_channel_is_socmed : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
## $ data_channel_is_tech : num [1:39644] 0 0 0 0 1 1 0 1 1 0 ...
## $ data_channel_is_world : num [1:39644] 0 0 0 0 0 0 0 0 0 1 ...
## $ kw_min_min : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_max_min : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_avg_min : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_min_max : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_max_max : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_avg_max : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_min_avg : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_max_avg : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_avg_avg : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
## $ self_reference_min_shares : num [1:39644] 496 0 918 0 545 8500 545 545 0 0 ...
## $ self_reference_max_shares : num [1:39644] 496 0 918 0 16000 8500 16000 16000 0 0 ...
## $ self_reference_avg_sharess : num [1:39644] 496 0 918 0 3151 ...
## $ weekday_is_monday : num [1:39644] 1 1 1 1 1 1 1 1 1 1 ...
## $ weekday_is_tuesday : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_wednesday : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_thursday : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_friday : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_saturday : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_sunday : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
## $ is_weekend : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
## $ LDA_00 : num [1:39644] 0.5003 0.7998 0.2178 0.0286 0.0286 ...
## $ LDA_01 : num [1:39644] 0.3783 0.05 0.0333 0.4193 0.0288 ...
## $ LDA_02 : num [1:39644] 0.04 0.0501 0.0334 0.4947 0.0286 ...
## $ LDA_03 : num [1:39644] 0.0413 0.0501 0.0333 0.0289 0.0286 ...
## $ LDA_04 : num [1:39644] 0.0401 0.05 0.6822 0.0286 0.8854 ...
## $ global_subjectivity : num [1:39644] 0.522 0.341 0.702 0.43 0.514 ...
## $ global_sentiment_polarity : num [1:39644] 0.0926 0.1489 0.3233 0.1007 0.281 ...
## $ global_rate_positive_words : num [1:39644] 0.0457 0.0431 0.0569 0.0414 0.0746 ...
## $ global_rate_negative_words : num [1:39644] 0.0137 0.01569 0.00948 0.02072 0.01213 ...
## $ rate_positive_words : num [1:39644] 0.769 0.733 0.857 0.667 0.86 ...
## $ rate_negative_words : num [1:39644] 0.231 0.267 0.143 0.333 0.14 ...
## $ avg_positive_polarity : num [1:39644] 0.379 0.287 0.496 0.386 0.411 ...
## $ min_positive_polarity : num [1:39644] 0.1 0.0333 0.1 0.1364 0.0333 ...
## $ max_positive_polarity : num [1:39644] 0.7 0.7 1 0.8 1 0.6 1 1 0.8 0.5 ...
## $ avg_negative_polarity : num [1:39644] -0.35 -0.119 -0.467 -0.37 -0.22 ...
## $ min_negative_polarity : num [1:39644] -0.6 -0.125 -0.8 -0.6 -0.5 -0.4 -0.5 -0.5 -0.125 -0.5 ...
## $ max_negative_polarity : num [1:39644] -0.2 -0.1 -0.133 -0.167 -0.05 ...
## $ title_subjectivity : num [1:39644] 0.5 0 0 0 0.455 ...
## $ title_sentiment_polarity : num [1:39644] -0.188 0 0 0 0.136 ...
## $ abs_title_subjectivity : num [1:39644] 0 0.5 0.5 0.5 0.0455 ...
## $ abs_title_sentiment_polarity : num [1:39644] 0.188 0 0 0 0.136 ...
## $ shares : num [1:39644] 593 711 1500 1200 505 855 556 891 3600 710 ...
## - attr(*, "spec")=
## .. cols(
## .. url = col_character(),
## .. timedelta = col_double(),
## .. n_tokens_title = col_double(),
## .. n_tokens_content = col_double(),
## .. n_unique_tokens = col_double(),
## .. n_non_stop_words = col_double(),
## .. n_non_stop_unique_tokens = col_double(),
## .. num_hrefs = col_double(),
## .. num_self_hrefs = col_double(),
## .. num_imgs = col_double(),
## .. num_videos = col_double(),
## .. average_token_length = col_double(),
## .. num_keywords = col_double(),
## .. data_channel_is_lifestyle = col_double(),
## .. data_channel_is_entertainment = col_double(),
## .. data_channel_is_bus = col_double(),
## .. data_channel_is_socmed = col_double(),
## .. data_channel_is_tech = col_double(),
## .. data_channel_is_world = col_double(),
## .. kw_min_min = col_double(),
## .. kw_max_min = col_double(),
## .. kw_avg_min = col_double(),
## .. kw_min_max = col_double(),
## .. kw_max_max = col_double(),
## .. kw_avg_max = col_double(),
## .. kw_min_avg = col_double(),
## .. kw_max_avg = col_double(),
## .. kw_avg_avg = col_double(),
## .. self_reference_min_shares = col_double(),
## .. self_reference_max_shares = col_double(),
## .. self_reference_avg_sharess = col_double(),
## .. weekday_is_monday = col_double(),
## .. weekday_is_tuesday = col_double(),
## .. weekday_is_wednesday = col_double(),
## .. weekday_is_thursday = col_double(),
## .. weekday_is_friday = col_double(),
## .. weekday_is_saturday = col_double(),
## .. weekday_is_sunday = col_double(),
## .. is_weekend = col_double(),
## .. LDA_00 = col_double(),
## .. LDA_01 = col_double(),
## .. LDA_02 = col_double(),
## .. LDA_03 = col_double(),
## .. LDA_04 = col_double(),
## .. global_subjectivity = col_double(),
## .. global_sentiment_polarity = col_double(),
## .. global_rate_positive_words = col_double(),
## .. global_rate_negative_words = col_double(),
## .. rate_positive_words = col_double(),
## .. rate_negative_words = col_double(),
## .. avg_positive_polarity = col_double(),
## .. min_positive_polarity = col_double(),
## .. max_positive_polarity = col_double(),
## .. avg_negative_polarity = col_double(),
## .. min_negative_polarity = col_double(),
## .. max_negative_polarity = col_double(),
## .. title_subjectivity = col_double(),
## .. title_sentiment_polarity = col_double(),
## .. abs_title_subjectivity = col_double(),
## .. abs_title_sentiment_polarity = col_double(),
## .. shares = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
# Convert the readr tibble into a plain base-R data.frame and re-inspect it;
# the str() output below no longer carries the readr "spec"/"problems" attributes
df <- data.frame(data)
str(df)
## 'data.frame': 39644 obs. of 61 variables:
## $ url : chr "http://mashable.com/2013/01/07/amazon-instant-video-browser/" "http://mashable.com/2013/01/07/ap-samsung-sponsored-tweets/" "http://mashable.com/2013/01/07/apple-40-billion-app-downloads/" "http://mashable.com/2013/01/07/astronaut-notre-dame-bcs/" ...
## $ timedelta : num 731 731 731 731 731 731 731 731 731 731 ...
## $ n_tokens_title : num 12 9 9 9 13 10 8 12 11 10 ...
## $ n_tokens_content : num 219 255 211 531 1072 ...
## $ n_unique_tokens : num 0.664 0.605 0.575 0.504 0.416 ...
## $ n_non_stop_words : num 1 1 1 1 1 ...
## $ n_non_stop_unique_tokens : num 0.815 0.792 0.664 0.666 0.541 ...
## $ num_hrefs : num 4 3 3 9 19 2 21 20 2 4 ...
## $ num_self_hrefs : num 2 1 1 0 19 2 20 20 0 1 ...
## $ num_imgs : num 1 1 1 1 20 0 20 20 0 1 ...
## $ num_videos : num 0 0 0 0 0 0 0 0 0 1 ...
## $ average_token_length : num 4.68 4.91 4.39 4.4 4.68 ...
## $ num_keywords : num 5 4 6 7 7 9 10 9 7 5 ...
## $ data_channel_is_lifestyle : num 0 0 0 0 0 0 1 0 0 0 ...
## $ data_channel_is_entertainment: num 1 0 0 1 0 0 0 0 0 0 ...
## $ data_channel_is_bus : num 0 1 1 0 0 0 0 0 0 0 ...
## $ data_channel_is_socmed : num 0 0 0 0 0 0 0 0 0 0 ...
## $ data_channel_is_tech : num 0 0 0 0 1 1 0 1 1 0 ...
## $ data_channel_is_world : num 0 0 0 0 0 0 0 0 0 1 ...
## $ kw_min_min : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_max_min : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_avg_min : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_min_max : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_max_max : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_avg_max : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_min_avg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_max_avg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_avg_avg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ self_reference_min_shares : num 496 0 918 0 545 8500 545 545 0 0 ...
## $ self_reference_max_shares : num 496 0 918 0 16000 8500 16000 16000 0 0 ...
## $ self_reference_avg_sharess : num 496 0 918 0 3151 ...
## $ weekday_is_monday : num 1 1 1 1 1 1 1 1 1 1 ...
## $ weekday_is_tuesday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_wednesday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_thursday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_friday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_saturday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_sunday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ is_weekend : num 0 0 0 0 0 0 0 0 0 0 ...
## $ LDA_00 : num 0.5003 0.7998 0.2178 0.0286 0.0286 ...
## $ LDA_01 : num 0.3783 0.05 0.0333 0.4193 0.0288 ...
## $ LDA_02 : num 0.04 0.0501 0.0334 0.4947 0.0286 ...
## $ LDA_03 : num 0.0413 0.0501 0.0333 0.0289 0.0286 ...
## $ LDA_04 : num 0.0401 0.05 0.6822 0.0286 0.8854 ...
## $ global_subjectivity : num 0.522 0.341 0.702 0.43 0.514 ...
## $ global_sentiment_polarity : num 0.0926 0.1489 0.3233 0.1007 0.281 ...
## $ global_rate_positive_words : num 0.0457 0.0431 0.0569 0.0414 0.0746 ...
## $ global_rate_negative_words : num 0.0137 0.01569 0.00948 0.02072 0.01213 ...
## $ rate_positive_words : num 0.769 0.733 0.857 0.667 0.86 ...
## $ rate_negative_words : num 0.231 0.267 0.143 0.333 0.14 ...
## $ avg_positive_polarity : num 0.379 0.287 0.496 0.386 0.411 ...
## $ min_positive_polarity : num 0.1 0.0333 0.1 0.1364 0.0333 ...
## $ max_positive_polarity : num 0.7 0.7 1 0.8 1 0.6 1 1 0.8 0.5 ...
## $ avg_negative_polarity : num -0.35 -0.119 -0.467 -0.37 -0.22 ...
## $ min_negative_polarity : num -0.6 -0.125 -0.8 -0.6 -0.5 -0.4 -0.5 -0.5 -0.125 -0.5 ...
## $ max_negative_polarity : num -0.2 -0.1 -0.133 -0.167 -0.05 ...
## $ title_subjectivity : num 0.5 0 0 0 0.455 ...
## $ title_sentiment_polarity : num -0.188 0 0 0 0.136 ...
## $ abs_title_subjectivity : num 0 0.5 0.5 0.5 0.0455 ...
## $ abs_title_sentiment_polarity : num 0.188 0 0 0 0.136 ...
## $ shares : num 593 711 1500 1200 505 855 556 891 3600 710 ...
# Per-column summary (min, quartiles, mean, max) of the raw data; used below to
# spot implausible values such as the n_non_stop_words maximum of 1042
summary(df)
## url timedelta n_tokens_title n_tokens_content
## Length:39644 Min. : 8.0 Min. : 2.0 Min. : 0.0
## Class :character 1st Qu.:164.0 1st Qu.: 9.0 1st Qu.: 246.0
## Mode :character Median :339.0 Median :10.0 Median : 409.0
## Mean :354.5 Mean :10.4 Mean : 546.5
## 3rd Qu.:542.0 3rd Qu.:12.0 3rd Qu.: 716.0
## Max. :731.0 Max. :23.0 Max. :8474.0
## n_unique_tokens n_non_stop_words n_non_stop_unique_tokens
## Min. : 0.0000 Min. : 0.0000 Min. : 0.0000
## 1st Qu.: 0.4709 1st Qu.: 1.0000 1st Qu.: 0.6257
## Median : 0.5392 Median : 1.0000 Median : 0.6905
## Mean : 0.5482 Mean : 0.9965 Mean : 0.6892
## 3rd Qu.: 0.6087 3rd Qu.: 1.0000 3rd Qu.: 0.7546
## Max. :701.0000 Max. :1042.0000 Max. :650.0000
## num_hrefs num_self_hrefs num_imgs num_videos
## Min. : 0.00 Min. : 0.000 Min. : 0.000 Min. : 0.00
## 1st Qu.: 4.00 1st Qu.: 1.000 1st Qu.: 1.000 1st Qu.: 0.00
## Median : 8.00 Median : 3.000 Median : 1.000 Median : 0.00
## Mean : 10.88 Mean : 3.294 Mean : 4.544 Mean : 1.25
## 3rd Qu.: 14.00 3rd Qu.: 4.000 3rd Qu.: 4.000 3rd Qu.: 1.00
## Max. :304.00 Max. :116.000 Max. :128.000 Max. :91.00
## average_token_length num_keywords data_channel_is_lifestyle
## Min. :0.000 Min. : 1.000 Min. :0.00000
## 1st Qu.:4.478 1st Qu.: 6.000 1st Qu.:0.00000
## Median :4.664 Median : 7.000 Median :0.00000
## Mean :4.548 Mean : 7.224 Mean :0.05295
## 3rd Qu.:4.855 3rd Qu.: 9.000 3rd Qu.:0.00000
## Max. :8.042 Max. :10.000 Max. :1.00000
## data_channel_is_entertainment data_channel_is_bus data_channel_is_socmed
## Min. :0.000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.000 Median :0.0000 Median :0.0000
## Mean :0.178 Mean :0.1579 Mean :0.0586
## 3rd Qu.:0.000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :1.000 Max. :1.0000 Max. :1.0000
## data_channel_is_tech data_channel_is_world kw_min_min kw_max_min
## Min. :0.0000 Min. :0.0000 Min. : -1.00 Min. : 0
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.: -1.00 1st Qu.: 445
## Median :0.0000 Median :0.0000 Median : -1.00 Median : 660
## Mean :0.1853 Mean :0.2126 Mean : 26.11 Mean : 1154
## 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.: 4.00 3rd Qu.: 1000
## Max. :1.0000 Max. :1.0000 Max. :377.00 Max. :298400
## kw_avg_min kw_min_max kw_max_max kw_avg_max
## Min. : -1.0 Min. : 0 Min. : 0 Min. : 0
## 1st Qu.: 141.8 1st Qu.: 0 1st Qu.:843300 1st Qu.:172847
## Median : 235.5 Median : 1400 Median :843300 Median :244572
## Mean : 312.4 Mean : 13612 Mean :752324 Mean :259282
## 3rd Qu.: 357.0 3rd Qu.: 7900 3rd Qu.:843300 3rd Qu.:330980
## Max. :42827.9 Max. :843300 Max. :843300 Max. :843300
## kw_min_avg kw_max_avg kw_avg_avg self_reference_min_shares
## Min. : -1 Min. : 0 Min. : 0 Min. : 0
## 1st Qu.: 0 1st Qu.: 3562 1st Qu.: 2382 1st Qu.: 639
## Median :1024 Median : 4356 Median : 2870 Median : 1200
## Mean :1117 Mean : 5657 Mean : 3136 Mean : 3999
## 3rd Qu.:2057 3rd Qu.: 6020 3rd Qu.: 3600 3rd Qu.: 2600
## Max. :3613 Max. :298400 Max. :43568 Max. :843300
## self_reference_max_shares self_reference_avg_sharess weekday_is_monday
## Min. : 0 Min. : 0.0 Min. :0.000
## 1st Qu.: 1100 1st Qu.: 981.2 1st Qu.:0.000
## Median : 2800 Median : 2200.0 Median :0.000
## Mean : 10329 Mean : 6401.7 Mean :0.168
## 3rd Qu.: 8000 3rd Qu.: 5200.0 3rd Qu.:0.000
## Max. :843300 Max. :843300.0 Max. :1.000
## weekday_is_tuesday weekday_is_wednesday weekday_is_thursday weekday_is_friday
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :0.0000 Median :0.0000 Median :0.0000
## Mean :0.1864 Mean :0.1875 Mean :0.1833 Mean :0.1438
## 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## weekday_is_saturday weekday_is_sunday is_weekend LDA_00
## Min. :0.00000 Min. :0.00000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.02505
## Median :0.00000 Median :0.00000 Median :0.0000 Median :0.03339
## Mean :0.06188 Mean :0.06904 Mean :0.1309 Mean :0.18460
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.0000 3rd Qu.:0.24096
## Max. :1.00000 Max. :1.00000 Max. :1.0000 Max. :0.92699
## LDA_01 LDA_02 LDA_03 LDA_04
## Min. :0.00000 Min. :0.00000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.02501 1st Qu.:0.02857 1st Qu.:0.02857 1st Qu.:0.02857
## Median :0.03334 Median :0.04000 Median :0.04000 Median :0.04073
## Mean :0.14126 Mean :0.21632 Mean :0.22377 Mean :0.23403
## 3rd Qu.:0.15083 3rd Qu.:0.33422 3rd Qu.:0.37576 3rd Qu.:0.39999
## Max. :0.92595 Max. :0.92000 Max. :0.92653 Max. :0.92719
## global_subjectivity global_sentiment_polarity global_rate_positive_words
## Min. :0.0000 Min. :-0.39375 Min. :0.00000
## 1st Qu.:0.3962 1st Qu.: 0.05776 1st Qu.:0.02838
## Median :0.4535 Median : 0.11912 Median :0.03902
## Mean :0.4434 Mean : 0.11931 Mean :0.03962
## 3rd Qu.:0.5083 3rd Qu.: 0.17783 3rd Qu.:0.05028
## Max. :1.0000 Max. : 0.72784 Max. :0.15549
## global_rate_negative_words rate_positive_words rate_negative_words
## Min. :0.000000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.009615 1st Qu.:0.6000 1st Qu.:0.1852
## Median :0.015337 Median :0.7105 Median :0.2800
## Mean :0.016612 Mean :0.6822 Mean :0.2879
## 3rd Qu.:0.021739 3rd Qu.:0.8000 3rd Qu.:0.3846
## Max. :0.184932 Max. :1.0000 Max. :1.0000
## avg_positive_polarity min_positive_polarity max_positive_polarity
## Min. :0.0000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.3062 1st Qu.:0.05000 1st Qu.:0.6000
## Median :0.3588 Median :0.10000 Median :0.8000
## Mean :0.3538 Mean :0.09545 Mean :0.7567
## 3rd Qu.:0.4114 3rd Qu.:0.10000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.00000 Max. :1.0000
## avg_negative_polarity min_negative_polarity max_negative_polarity
## Min. :-1.0000 Min. :-1.0000 Min. :-1.0000
## 1st Qu.:-0.3284 1st Qu.:-0.7000 1st Qu.:-0.1250
## Median :-0.2533 Median :-0.5000 Median :-0.1000
## Mean :-0.2595 Mean :-0.5219 Mean :-0.1075
## 3rd Qu.:-0.1869 3rd Qu.:-0.3000 3rd Qu.:-0.0500
## Max. : 0.0000 Max. : 0.0000 Max. : 0.0000
## title_subjectivity title_sentiment_polarity abs_title_subjectivity
## Min. :0.0000 Min. :-1.00000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.: 0.00000 1st Qu.:0.1667
## Median :0.1500 Median : 0.00000 Median :0.5000
## Mean :0.2824 Mean : 0.07143 Mean :0.3418
## 3rd Qu.:0.5000 3rd Qu.: 0.15000 3rd Qu.:0.5000
## Max. :1.0000 Max. : 1.00000 Max. :0.5000
## abs_title_sentiment_polarity shares
## Min. :0.0000 Min. : 1
## 1st Qu.:0.0000 1st Qu.: 946
## Median :0.0000 Median : 1400
## Mean :0.1561 Mean : 3395
## 3rd Qu.:0.2500 3rd Qu.: 2800
## Max. :1.0000 Max. :843300
# Missing values: confirm that no cell of the data set is NA
anyNA(df)
## [1] FALSE
# Outliers: remove the row whose n_non_stop_words equals 1042; every remaining
# value of that variable lies in [0, 1] (see the summary printed next)
df <- df[df$n_non_stop_words != 1042, ]
summary(df$n_non_stop_words) # fixed
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 1.0000 1.0000 0.9702 1.0000 1.0000
# Dropping the url and timedelta variables (the first two columns), by name
df1 <- df[, !(names(df) %in% c("url", "timedelta"))]
# Any errors
# Checking importance of news subjects (0/1 data-channel indicators) on shares.
# Columns are selected by name prefix rather than by position: in df1 the
# channel indicators sit at 12:17 and the weekday indicators at 30:36, so the
# previous ranges 13:18 and 31:37 skipped data_channel_is_lifestyle and
# weekday_is_monday and plotted kw_min_min and is_weekend instead.
channel_cols <- grep("^data_channel_is_", names(df1), value = TRUE)
for (col in channel_cols) {
  # One boxplot of log(shares) per indicator, split by its 0/1 value
  boxplot(log(df1$shares) ~ df1[[col]], xlab = col, ylab = "log(shares)")
}
# Checking importance of weekdays on news shares
weekday_cols <- grep("^weekday_is_", names(df1), value = TRUE)
for (col in weekday_cols) {
  boxplot(log(df1$shares) ~ df1[[col]], xlab = col, ylab = "log(shares)")
}
# Converting categorical values from numeric to factor - Weekdays
#for (col in weekday_cols){
# df1[[col]] <- factor(df1[[col]])
#}
Summary statistics
library(psych)
# Summary statistics of df1, i.e. the data after the outlier row and the
# url/timedelta columns were removed
summary(df1)
## n_tokens_title n_tokens_content n_unique_tokens n_non_stop_words
## Min. : 2.0 Min. : 0.0 Min. :0.0000 Min. :0.0000
## 1st Qu.: 9.0 1st Qu.: 246.0 1st Qu.:0.4709 1st Qu.:1.0000
## Median :10.0 Median : 409.0 Median :0.5392 Median :1.0000
## Mean :10.4 Mean : 546.5 Mean :0.5305 Mean :0.9702
## 3rd Qu.:12.0 3rd Qu.: 716.0 3rd Qu.:0.6087 3rd Qu.:1.0000
## Max. :23.0 Max. :8474.0 Max. :1.0000 Max. :1.0000
## n_non_stop_unique_tokens num_hrefs num_self_hrefs num_imgs
## Min. :0.0000 Min. : 0.00 Min. : 0.000 Min. : 0.000
## 1st Qu.:0.6257 1st Qu.: 4.00 1st Qu.: 1.000 1st Qu.: 1.000
## Median :0.6905 Median : 8.00 Median : 3.000 Median : 1.000
## Mean :0.6728 Mean : 10.88 Mean : 3.293 Mean : 4.543
## 3rd Qu.:0.7546 3rd Qu.: 14.00 3rd Qu.: 4.000 3rd Qu.: 4.000
## Max. :1.0000 Max. :304.00 Max. :116.000 Max. :128.000
## num_videos average_token_length num_keywords
## Min. : 0.00 Min. :0.000 Min. : 1.000
## 1st Qu.: 0.00 1st Qu.:4.478 1st Qu.: 6.000
## Median : 0.00 Median :4.664 Median : 7.000
## Mean : 1.25 Mean :4.548 Mean : 7.224
## 3rd Qu.: 1.00 3rd Qu.:4.855 3rd Qu.: 9.000
## Max. :91.00 Max. :8.042 Max. :10.000
## data_channel_is_lifestyle data_channel_is_entertainment data_channel_is_bus
## Min. :0.00000 Min. :0.000 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.:0.000 1st Qu.:0.0000
## Median :0.00000 Median :0.000 Median :0.0000
## Mean :0.05295 Mean :0.178 Mean :0.1579
## 3rd Qu.:0.00000 3rd Qu.:0.000 3rd Qu.:0.0000
## Max. :1.00000 Max. :1.000 Max. :1.0000
## data_channel_is_socmed data_channel_is_tech data_channel_is_world
## Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :0.0000 Median :0.0000
## Mean :0.0586 Mean :0.1853 Mean :0.2126
## 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000
## kw_min_min kw_max_min kw_avg_min kw_min_max
## Min. : -1.00 Min. : 0 Min. : -1.0 Min. : 0
## 1st Qu.: -1.00 1st Qu.: 445 1st Qu.: 141.8 1st Qu.: 0
## Median : -1.00 Median : 660 Median : 235.5 Median : 1400
## Mean : 26.11 Mean : 1154 Mean : 312.4 Mean : 13612
## 3rd Qu.: 4.00 3rd Qu.: 1000 3rd Qu.: 357.0 3rd Qu.: 7900
## Max. :377.00 Max. :298400 Max. :42827.9 Max. :843300
## kw_max_max kw_avg_max kw_min_avg kw_max_avg
## Min. : 0 Min. : 0 Min. : -1 Min. : 0
## 1st Qu.:843300 1st Qu.:172844 1st Qu.: 0 1st Qu.: 3562
## Median :843300 Median :244567 Median :1024 Median : 4356
## Mean :752322 Mean :259280 Mean :1117 Mean : 5657
## 3rd Qu.:843300 3rd Qu.:330980 3rd Qu.:2057 3rd Qu.: 6020
## Max. :843300 Max. :843300 Max. :3613 Max. :298400
## kw_avg_avg self_reference_min_shares self_reference_max_shares
## Min. : 0 Min. : 0 Min. : 0
## 1st Qu.: 2382 1st Qu.: 639 1st Qu.: 1100
## Median : 2870 Median : 1200 Median : 2800
## Mean : 3136 Mean : 3999 Mean : 10330
## 3rd Qu.: 3600 3rd Qu.: 2600 3rd Qu.: 8000
## Max. :43568 Max. :843300 Max. :843300
## self_reference_avg_sharess weekday_is_monday weekday_is_tuesday
## Min. : 0.0 Min. :0.000 Min. :0.0000
## 1st Qu.: 981.1 1st Qu.:0.000 1st Qu.:0.0000
## Median : 2200.0 Median :0.000 Median :0.0000
## Mean : 6401.7 Mean :0.168 Mean :0.1864
## 3rd Qu.: 5200.0 3rd Qu.:0.000 3rd Qu.:0.0000
## Max. :843300.0 Max. :1.000 Max. :1.0000
## weekday_is_wednesday weekday_is_thursday weekday_is_friday weekday_is_saturday
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.00000
## Median :0.0000 Median :0.0000 Median :0.0000 Median :0.00000
## Mean :0.1875 Mean :0.1833 Mean :0.1438 Mean :0.06188
## 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.00000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.00000
## weekday_is_sunday is_weekend LDA_00 LDA_01
## Min. :0.00000 Min. :0.0000 Min. :0.01818 Min. :0.01818
## 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.02505 1st Qu.:0.02501
## Median :0.00000 Median :0.0000 Median :0.03339 Median :0.03334
## Mean :0.06904 Mean :0.1309 Mean :0.18460 Mean :0.14126
## 3rd Qu.:0.00000 3rd Qu.:0.0000 3rd Qu.:0.24097 3rd Qu.:0.15084
## Max. :1.00000 Max. :1.0000 Max. :0.92699 Max. :0.92595
## LDA_02 LDA_03 LDA_04 global_subjectivity
## Min. :0.01818 Min. :0.01818 Min. :0.01818 Min. :0.0000
## 1st Qu.:0.02857 1st Qu.:0.02857 1st Qu.:0.02857 1st Qu.:0.3962
## Median :0.04000 Median :0.04000 Median :0.04073 Median :0.4535
## Mean :0.21633 Mean :0.22378 Mean :0.23404 Mean :0.4434
## 3rd Qu.:0.33422 3rd Qu.:0.37578 3rd Qu.:0.39999 3rd Qu.:0.5083
## Max. :0.92000 Max. :0.92653 Max. :0.92719 Max. :1.0000
## global_sentiment_polarity global_rate_positive_words
## Min. :-0.39375 Min. :0.00000
## 1st Qu.: 0.05776 1st Qu.:0.02839
## Median : 0.11912 Median :0.03902
## Mean : 0.11931 Mean :0.03963
## 3rd Qu.: 0.17784 3rd Qu.:0.05028
## Max. : 0.72784 Max. :0.15549
## global_rate_negative_words rate_positive_words rate_negative_words
## Min. :0.000000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.009615 1st Qu.:0.6000 1st Qu.:0.1852
## Median :0.015337 Median :0.7105 Median :0.2800
## Mean :0.016613 Mean :0.6822 Mean :0.2879
## 3rd Qu.:0.021739 3rd Qu.:0.8000 3rd Qu.:0.3846
## Max. :0.184932 Max. :1.0000 Max. :1.0000
## avg_positive_polarity min_positive_polarity max_positive_polarity
## Min. :0.0000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.3062 1st Qu.:0.05000 1st Qu.:0.6000
## Median :0.3588 Median :0.10000 Median :0.8000
## Mean :0.3538 Mean :0.09545 Mean :0.7567
## 3rd Qu.:0.4114 3rd Qu.:0.10000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.00000 Max. :1.0000
## avg_negative_polarity min_negative_polarity max_negative_polarity
## Min. :-1.0000 Min. :-1.000 Min. :-1.0000
## 1st Qu.:-0.3284 1st Qu.:-0.700 1st Qu.:-0.1250
## Median :-0.2533 Median :-0.500 Median :-0.1000
## Mean :-0.2595 Mean :-0.522 Mean :-0.1075
## 3rd Qu.:-0.1869 3rd Qu.:-0.300 3rd Qu.:-0.0500
## Max. : 0.0000 Max. : 0.000 Max. : 0.0000
## title_subjectivity title_sentiment_polarity abs_title_subjectivity
## Min. :0.0000 Min. :-1.00000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.: 0.00000 1st Qu.:0.1667
## Median :0.1500 Median : 0.00000 Median :0.5000
## Mean :0.2824 Mean : 0.07143 Mean :0.3419
## 3rd Qu.:0.5000 3rd Qu.: 0.15000 3rd Qu.:0.5000
## Max. :1.0000 Max. : 1.00000 Max. :0.5000
## abs_title_sentiment_polarity shares
## Min. :0.0000 Min. : 1
## 1st Qu.:0.0000 1st Qu.: 946
## Median :0.0000 Median : 1400
## Mean :0.1561 Mean : 3395
## 3rd Qu.:0.2500 3rd Qu.: 2800
## Max. :1.0000 Max. :843300
# Looking more closely at our target variable (shares)
summary(df1$shares)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1 946 1400 3395 2800 843300
# describe() additionally reports sd, skew and kurtosis (see output below)
describe(df1$shares)
## vars n mean sd median trimmed mad min max range skew
## X1 1 39643 3395.32 11627.09 1400 1888.71 889.56 1 843300 843299 33.96
## kurtosis se
## X1 1832.31 58.4
Popularity categories distribution table
library(ggplot2)
# Split the articles at 1400 shares (the median of `shares`) into popular and
# non-popular groups, and count per weekday indicator how many fall in each
columns_day <- grep("^weekday_is_", names(df1), value = TRUE)
pop <- subset(df1, shares >= 1400)
unpop <- subset(df1, shares < 1400)
pop_day <- colSums(pop[columns_day])
unpop_day <- colSums(unpop[columns_day])
# Class sizes of the two groups, drawn as a simple bar chart
df_freq <- data.frame(
  Class = c("Popular", "Non-popular"),
  Frequency = c(nrow(pop), nrow(unpop))
)
ggplot(df_freq, aes(x = Class, y = Frequency, fill = Class)) +
  geom_col() +
  labs(
    title = "Frequency Distribution of Popular and Non-Popular Articles",
    x = "",
    y = "Frequency"
  ) +
  theme(
    plot.title = element_text(size = 16, face = "bold"),
    axis.text.x = element_text(size = 14),
    axis.text.y = element_text(size = 14),
    axis.title = element_text(size = 14, face = "bold")
  )
Weekday distribution table
# One row per weekday indicator with its popular and unpopular article counts
df_day_counts <- data.frame(Day = columns_day, Popular = pop_day, Unpopular = unpop_day)
# Reshape to long format: one row per (day, category) pair
df_day_counts_long <- tidyr::pivot_longer(
  df_day_counts,
  cols = c("Popular", "Unpopular"),
  names_to = "Category",
  values_to = "Count"
)
# Grouped (side-by-side) bar plot of the counts per weekday
ggplot(df_day_counts_long, aes(x = Day, y = Count, fill = Category)) +
  geom_col(position = "dodge") +
  labs(
    title = "Count of popular/unpopular news over different day of week",
    x = "Days of week",
    y = "Count"
  ) +
  theme(
    plot.title = element_text(size = 16),
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title = element_text(size = 12)
  ) +
  scale_fill_manual(values = c("red", "blue"), labels = c("popular", "unpopular")) +
  guides(fill = guide_legend(title = NULL))
#theme(plot.margin = margin(1, 1, 1, 1, "cm"))
Data channels distribution table
library(ggplot2)
# Count popular / unpopular articles within each data-channel indicator column
columns_chan <- grep("^data_channel_is_", names(df1), value = TRUE)
pop_chan <- colSums(pop[columns_chan])
unpop_chan <- colSums(unpop[columns_chan])
# One row per channel, then reshape to long format for the grouped bar plot
df_grouped <- data.frame(category = columns_chan, popular = pop_chan, unpopular = unpop_chan)
df_channel <- tidyr::pivot_longer(
  df_grouped,
  cols = c("popular", "unpopular"),
  names_to = "Category",
  values_to = "Count"
)
# Grouped (side-by-side) bar plot of the counts per channel
ggplot(df_channel, aes(x = category, y = Count, fill = Category)) +
  geom_col(position = "dodge") +
  labs(
    title = "Count of popular/unpopular news over different channels",
    x = "Different channels",
    y = "Count"
  ) +
  theme(
    plot.title = element_text(size = 16),
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title = element_text(size = 12)
  ) +
  scale_fill_manual(values = c("green", "orange"), labels = c("popular", "unpopular")) +
  guides(fill = guide_legend(title = NULL))
#theme(plot.margin = margin(1, 1, 1, 1, "cm"))
# Further preprocessing: replace shares by its natural log (the raw variable is
# heavily right-skewed, see describe() above). Run once only: re-running this
# line would log-transform the already transformed values.
df1[["shares"]] <- log(df1[["shares"]])
summary(df1[["shares"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 6.852 7.244 7.475 7.937 13.645
library(randomForest)
library(rpart)
library(ISLR)
library(dplyr)
library(xgboost)
# Compare random forest, linear regression and bagged CART on log(shares).
# Each of the n repetitions trains on a bootstrap sample of df1 and scores all
# three models on the rows that were not drawn (out-of-bag rows).
n <- 10 # number of bootstrap repetitions
B <- 20 # trees per forest / per bagging ensemble
lm_rmse <- numeric(n)
RF_RMSPE <- numeric(n)
RMSPE_B <- numeric(n)
for (i in seq_len(n)) {
  # Bootstrap row indices; rows never drawn become the test set
  ind <- sample.int(nrow(df1), replace = TRUE)
  train <- df1[ind, ]
  test <- df1[-ind, ]
  # Column j will hold the predictions of the j-th bagged tree
  yhat <- matrix(0, nrow(test), B)

  # Random Forest
  rf_model <- randomForest(shares ~ ., data = train, ntree = B)
  yhat1 <- predict(rf_model, test)
  RF_RMSPE[i] <- sqrt(mean((test$shares - yhat1)^2))
  # localImp = TRUE

  ## Comparing with a parametric model (what about glm?)
  lm_model <- lm(shares ~ ., data = train)
  lm_pred <- predict(lm_model, test)
  lm_rmse[i] <- sqrt(mean((lm_pred - test$shares)^2))

  # BAGGING: B regression trees, each grown on a resample of the training set
  for (j in seq_len(B)) {
    indBB <- sample.int(nrow(train), replace = TRUE)
    trBB <- train[indBB, ]
    model <- rpart(shares ~ ., data = trBB, method = "anova")
    yhat[, j] <- predict(model, test)
  }
  # Ensemble prediction = average over the B trees
  yhatB <- apply(yhat, 1, mean)
  RMSPE_B[i] <- sqrt(mean((test$shares - yhatB)^2))
}
# Average test RMSPE (on the log-shares scale) of each model over the n runs
# Random forest
mean(RF_RMSPE)
## [1] 0.8730865
# Linear model
mean(lm_rmse)
## [1] 0.8694631
# Bagged regression trees
mean(RMSPE_B)
## [1] 0.8922103
# TODO: decide whether to bootstrap-sample or hold-out split the data
# Stack the per-run RMSPE values of the three models into one long data frame.
# It gets its own name (rmspe_df) so that the cleaned data set stored in `df`
# is not overwritten by a plotting helper.
all_RMSPE <- c(RF_RMSPE, lm_rmse, RMSPE_B)
# Model label for every RMSPE value, in the same order as all_RMSPE
model_names <- rep(c("RF", "LM", "BAGGING"), each = n)
rmspe_df <- data.frame(RMSPE = all_RMSPE, Model = model_names)
# Boxplot of the RMSPE distribution per model
library(ggplot2)
ggplot(rmspe_df, aes(x = Model, y = RMSPE, fill = Model)) +
  geom_boxplot() +
  scale_fill_manual(values = c("red", "blue", "green")) +
  labs(x = "Model", y = "RMSPE") +
  ggtitle("RMSPE for Different Models")
library(randomForest)
library(PASWR)
library(ROCR)
library(rpart)
# Binary target: articles whose log(shares) is below the mean are "unpopular",
# all others "popular". The numeric shares column is then removed so it cannot
# be used as a predictor.
average <- mean(df1$shares)
nwshares <- ifelse(df1$shares < average, "unpopular", "popular")
newdata <- cbind(df1, nwshares)
newdata <- newdata[, setdiff(names(newdata), "shares")]
newdata$nwshares <- factor(newdata$nwshares)
df <- newdata # for rpart
dff <- df[complete.cases(df), ] # for rf

# Compare CART, bagged trees and random forest by test AUC over n bootstrap runs
n <- 10
B <- 20
AUC1 <- numeric(n)
AUC2 <- numeric(n)
AUC3 <- numeric(n)
for (i in seq_len(n)) {
  # NOTE(review): train/test drawn from df are not used by any model below;
  # they are kept so the random-number stream and globals stay unchanged
  ind <- sample.int(nrow(df), replace = TRUE)
  train <- df[ind, ]
  test <- df[-ind, ]

  # Bootstrap sample actually used for fitting; out-of-bag rows are the test set
  ind <- sample.int(nrow(dff), replace = TRUE)
  train2 <- dff[ind, ]
  test2 <- dff[-ind, ]

  # Number of predictors; mtry = p makes randomForest consider every predictor
  # at each split, i.e. plain bagging
  p <- ncol(train2) - 1

  model1 <- rpart(nwshares ~ ., data = train2, method = "class") # cart
  model2 <- randomForest(nwshares ~ ., data = train2, ntree = B, mtry = p) # Bagged
  model3 <- randomForest(nwshares ~ ., data = train2, ntree = B) # RF

  # Class-probability matrices, one column per factor level
  phat1 <- predict(model1, test2, type = "prob")
  phat2 <- predict(model2, test2, type = "prob")
  phat3 <- predict(model3, test2, type = "prob")

  # AUC1 (CART): score = probability in the second column
  pred_rocr1 <- prediction(phat1[, 2], test2$nwshares)
  auc_ROCR1 <- performance(pred_rocr1, measure = "auc")
  AUC1[i] <- auc_ROCR1@y.values[[1]]

  # AUC2 (bagged trees)
  pred_rocr2 <- prediction(phat2[, 2], test2$nwshares)
  auc_ROCR2 <- performance(pred_rocr2, measure = "auc")
  AUC2[i] <- auc_ROCR2@y.values[[1]]

  # AUC3 (random forest)
  pred_rocr3 <- prediction(phat3[, 2], test2$nwshares)
  auc_ROCR3 <- performance(pred_rocr3, measure = "auc")
  AUC3[i] <- auc_ROCR3@y.values[[1]]
}
# Mean test AUC over the n bootstrap runs: CART, bagged trees, random forest
mean(AUC1)
## [1] 0.6183209
mean(AUC2)
## [1] 0.6839123
mean(AUC3)
## [1] 0.6871194
# Run-to-run standard deviation of the AUC, same model order
sd(AUC1)
## [1] 0.01009076
sd(AUC2)
## [1] 0.00277586
sd(AUC3)
## [1] 0.003396944
# Mean AUC per classifier, one row per model
auc_data <- data.frame(
  Model = c("CART", "Bagged", "Random Forest"),
  AUC = c(mean(AUC1), mean(AUC2), mean(AUC3))
)
# Bar chart comparing the mean AUC of the three classifiers
library(ggplot2)
ggplot(auc_data, aes(x = Model, y = AUC, fill = Model)) +
  geom_col(color = "black") +
  labs(title = "AUC for Three Models", x = "Model", y = "AUC") +
  theme(legend.position = "none")
# "Uncertainty" of each model, defined here as 1 - AUC per bootstrap run
# (UN1 = CART, UN2 = bagged trees, UN3 = random forest)
UN1 <- 1 - AUC1
UN2 <- 1 - AUC2
UN3 <- 1 - AUC3
# Mean of 1 - AUC over the runs, same model order
mean(UN1)
## [1] 0.3816791
mean(UN2)
## [1] 0.3160877
mean(UN3)
## [1] 0.3128806
## JOUSBoost with the response coded as 1 / -1
# An article is labelled 1 when its share count is above the mean of
# `df1$shares`, and -1 otherwise.
thresh <- mean(df1$shares)
Y <- 2 * (df1$shares > thresh) - 1
# Append the label, then drop column 59 by position.
# NOTE(review): presumably column 59 is the raw `shares` column -- confirm.
bstdata <- cbind(df1, Y)[, -59]
# DF$Y <- ifelse(df$y == 0, -1, 1)
# adaboost takes the X's as a matrix
library(mltools)
library(data.table)
df_hot <- one_hot(as.data.table(bstdata))
library(JOUSBoost)
# Tune the number of boosting rounds: for every candidate in `rnd`, fit
# AdaBoost with depth-1 trees on 10 bootstrap samples of `df_hot` and average
# the AUC on the rows left out of each sample.
rnd <- seq(50, 200, 50) # candidate numbers of boosting rounds
MAUC <- numeric(length(rnd)) # mean AUC per candidate
for (r in seq_along(rnd)) {
  aucc <- numeric(10) # AUC per bootstrap replication for this candidate
  for (i in seq_along(aucc)) {
    ind <- sample(nrow(df_hot), nrow(df_hot), replace = TRUE)
    train <- df_hot[ind, ]
    val <- df_hot[-ind, ]
    # Use the candidate under evaluation. This was hard-coded to 50, so every
    # entry of MAUC estimated the same model and the grid had no effect.
    ada <- adaboost(as.matrix(train[, -"Y"]),
                    train$Y,
                    tree_depth = 1,
                    n_rounds = rnd[r])
    # Score the held-out rows on the predictors only (label column removed),
    # mirroring the matrix that was passed to adaboost().
    phat <- predict(ada, as.matrix(val[, -"Y"]), type = "prob")
    pred_rocR <- prediction(phat, val$Y)
    auc_ROCR <- performance(pred_rocR, measure = "auc")
    aucc[i] <- auc_ROCR@y.values[[1]]
  }
  MAUC[r] <- mean(aucc)
}
# Best mean AUC over the grid. `max(aucc)` only looked at the replications of
# the last candidate.
max(MAUC)
## (stale: 0.7096108 was recorded for max(aucc) with n_rounds fixed at 50; re-run to refresh)
library(randomForestExplainer)
# checking the importance and confusion matrix
# CLASSIFICATION
# `model1` is the rpart tree. rpart keeps its importance scores in the
# `variable.importance` component; `model1$importance` does not exist on an
# rpart fit, which is why it previously printed NULL.
model1$variable.importance
#FOR BAGGED
# Variable-importance plot for the bagged forest (`model2`).
varImpPlot(model2)
# Per-variable importance summary from randomForestExplainer. As the recorded
# warning below states, 'accuracy_decrease' is unavailable because the forest
# was grown without localImp = TRUE.
importance_frame <- measure_importance(model2)
## [1] "Warning: your forest does not contain information on local importance so 'accuracy_decrease' measure cannot be extracted. To add it regrow the forest with the option localImp = TRUE and run this function again."
# Print the summary (the recorded output follows).
importance_frame
## variable mean_min_depth no_of_nodes gini_decrease
## 1 abs_title_sentiment_polarity 6.30 730 175.88612
## 2 abs_title_subjectivity 6.30 823 197.88940
## 3 average_token_length 4.90 2408 599.24921
## 4 avg_negative_polarity 5.20 1939 481.62915
## 5 avg_positive_polarity 4.95 2116 533.80894
## 6 data_channel_is_bus 10.05 92 18.37003
## 7 data_channel_is_entertainment 1.55 81 179.14631
## 8 data_channel_is_lifestyle 8.40 169 37.09159
## 9 data_channel_is_socmed 2.55 136 157.36397
## 10 data_channel_is_tech 1.80 99 169.93335
## 11 data_channel_is_world 9.95 110 22.98580
## 12 global_rate_negative_words 4.70 1725 434.11959
## 13 global_rate_positive_words 5.40 2029 515.85172
## 14 global_sentiment_polarity 5.45 1848 459.57302
## 15 global_subjectivity 4.95 2181 554.49273
## 16 is_weekend 2.75 212 155.88792
## 17 kw_avg_avg 0.00 2425 1422.30383
## 18 kw_avg_max 4.05 2423 637.22205
## 19 kw_avg_min 4.50 2063 549.89822
## 20 kw_max_avg 4.25 2448 656.16505
## 21 kw_max_max 3.95 379 155.19740
## 22 kw_max_min 4.60 1984 484.12168
## 23 kw_min_avg 3.55 1359 386.54789
## 24 kw_min_max 5.10 1179 298.92844
## 25 kw_min_min 6.90 250 61.91794
## 26 LDA_00 4.10 2211 621.27472
## 27 LDA_01 4.55 2386 606.51774
## 28 LDA_02 3.85 2163 564.98857
## 29 LDA_03 4.85 2120 530.08050
## 30 LDA_04 4.65 2280 583.52389
## 31 max_negative_polarity 5.95 1156 246.12376
## 32 max_positive_polarity 6.85 693 157.54915
## 33 min_negative_polarity 5.25 921 211.58774
## 34 min_positive_polarity 4.40 895 229.39192
## 35 n_non_stop_unique_tokens 4.10 1990 566.68550
## 36 n_non_stop_words 4.80 1416 355.87255
## 37 n_tokens_content 4.25 1393 375.26570
## 38 n_tokens_title 5.45 1605 348.24349
## 39 n_unique_tokens 3.45 1654 482.37754
## 40 num_hrefs 5.10 1565 392.85623
## 41 num_imgs 5.60 1077 257.32666
## 42 num_keywords 6.90 553 113.24019
## 43 num_self_hrefs 6.05 1111 235.24689
## 44 num_videos 6.35 770 175.77564
## 45 rate_negative_words 5.40 1002 224.74378
## 46 rate_positive_words 6.40 1013 220.57527
## 47 self_reference_avg_sharess 3.10 1472 428.84535
## 48 self_reference_max_shares 4.95 1303 336.29946
## 49 self_reference_min_shares 1.60 1491 612.37870
## 50 title_sentiment_polarity 5.50 1342 288.96890
## 51 title_subjectivity 5.75 873 214.23341
## 52 weekday_is_friday 8.55 263 48.54308
## 53 weekday_is_monday 8.70 255 54.23933
## 54 weekday_is_saturday 4.15 193 66.16877
## 55 weekday_is_sunday 8.45 149 29.11311
## 56 weekday_is_thursday 8.00 264 56.17353
## 57 weekday_is_tuesday 8.70 242 55.68694
## 58 weekday_is_wednesday 7.80 279 63.92562
## no_of_trees times_a_root p_value
## 1 20 0 1.000000e+00
## 2 20 0 1.000000e+00
## 3 20 0 3.512326e-213
## 4 20 0 1.283028e-88
## 5 20 0 2.980437e-130
## 6 20 0 1.000000e+00
## 7 20 0 1.000000e+00
## 8 20 0 1.000000e+00
## 9 20 0 1.000000e+00
## 10 20 0 1.000000e+00
## 11 20 0 1.000000e+00
## 12 20 0 5.767971e-48
## 13 20 0 6.311421e-109
## 14 20 0 5.614276e-70
## 15 20 0 3.118287e-147
## 16 20 0 1.000000e+00
## 17 20 20 1.615906e-218
## 18 20 0 6.904089e-218
## 19 20 0 4.574608e-117
## 20 20 0 7.986184e-226
## 21 20 0 1.000000e+00
## 22 20 0 1.519578e-98
## 23 20 0 1.466653e-06
## 24 20 0 6.832697e-01
## 25 20 0 1.000000e+00
## 26 20 0 2.306134e-155
## 27 20 0 2.357388e-206
## 28 20 0 1.923284e-142
## 29 20 0 2.849979e-131
## 30 20 0 9.410657e-175
## 31 20 0 8.755924e-01
## 32 20 0 1.000000e+00
## 33 20 0 1.000000e+00
## 34 20 0 1.000000e+00
## 35 20 0 6.661748e-100
## 36 20 0 1.935765e-10
## 37 20 0 9.400985e-09
## 38 20 0 3.104245e-30
## 39 20 0 5.182808e-37
## 40 20 0 3.439695e-25
## 41 20 0 9.997766e-01
## 42 20 0 1.000000e+00
## 43 20 0 9.936026e-01
## 44 20 0 1.000000e+00
## 45 20 0 1.000000e+00
## 46 20 0 1.000000e+00
## 47 20 0 3.231495e-15
## 48 20 0 9.741551e-04
## 49 20 0 4.741323e-17
## 50 20 0 1.341195e-05
## 51 20 0 1.000000e+00
## 52 20 0 1.000000e+00
## 53 20 0 1.000000e+00
## 54 20 0 1.000000e+00
## 55 20 0 1.000000e+00
## 56 20 0 1.000000e+00
## 57 20 0 1.000000e+00
## 58 20 0 1.000000e+00
#FOR RANDOM FOREST
# Variable-importance plot for the random forest (`model3`).
varImpPlot(model3)
# Per-variable importance summary; overwrites the `importance_frame` computed
# for the bagged model. 'accuracy_decrease' is again unavailable because the
# forest was grown without localImp = TRUE (see the recorded warning).
importance_frame <- measure_importance(model3)
## [1] "Warning: your forest does not contain information on local importance so 'accuracy_decrease' measure cannot be extracted. To add it regrow the forest with the option localImp = TRUE and run this function again."
# Print the summary (the recorded output follows).
importance_frame
## variable mean_min_depth no_of_nodes gini_decrease
## 1 abs_title_sentiment_polarity 5.60 1347 223.45907
## 2 abs_title_subjectivity 5.75 1375 241.39813
## 3 average_token_length 4.50 2617 472.51235
## 4 avg_negative_polarity 4.90 2446 448.57534
## 5 avg_positive_polarity 5.50 2615 481.19434
## 6 data_channel_is_bus 7.50 244 49.15486
## 7 data_channel_is_entertainment 4.05 211 139.96816
## 8 data_channel_is_lifestyle 9.35 179 27.21358
## 9 data_channel_is_socmed 4.95 202 89.20008
## 10 data_channel_is_tech 5.50 299 89.95859
## 11 data_channel_is_world 6.10 201 83.17301
## 12 global_rate_negative_words 5.05 2371 415.06913
## 13 global_rate_positive_words 5.00 2515 448.63125
## 14 global_sentiment_polarity 4.70 2493 454.66263
## 15 global_subjectivity 5.05 2573 489.62904
## 16 is_weekend 3.10 297 137.08441
## 17 kw_avg_avg 2.45 2994 891.81498
## 18 kw_avg_max 4.20 2929 563.06035
## 19 kw_avg_min 4.55 2780 522.48042
## 20 kw_max_avg 2.55 2942 768.01355
## 21 kw_max_max 4.90 663 143.92281
## 22 kw_max_min 4.05 2740 526.48079
## 23 kw_min_avg 3.95 1914 431.95954
## 24 kw_min_max 4.75 1711 322.47131
## 25 kw_min_min 6.70 598 99.83154
## 26 LDA_00 3.90 2845 553.17881
## 27 LDA_01 4.25 2716 534.02020
## 28 LDA_02 3.15 2821 647.48997
## 29 LDA_03 5.20 2682 492.31013
## 30 LDA_04 4.25 2777 526.80679
## 31 max_negative_polarity 6.25 1631 264.16100
## 32 max_positive_polarity 6.60 1143 176.72285
## 33 min_negative_polarity 5.45 1614 263.38545
## 34 min_positive_polarity 4.65 1320 247.13563
## 35 n_non_stop_unique_tokens 4.15 2692 507.40346
## 36 n_non_stop_words 4.25 2458 464.83665
## 37 n_tokens_content 3.95 2477 475.45621
## 38 n_tokens_title 5.75 1871 301.50136
## 39 n_unique_tokens 3.90 2528 486.38239
## 40 num_hrefs 4.35 2131 399.37030
## 41 num_imgs 4.60 1444 270.65830
## 42 num_keywords 5.95 1243 187.66695
## 43 num_self_hrefs 5.30 1463 250.27919
## 44 num_videos 6.20 951 161.89995
## 45 rate_negative_words 4.75 2193 381.72670
## 46 rate_positive_words 4.85 2134 379.24549
## 47 self_reference_avg_sharess 3.00 2138 550.15048
## 48 self_reference_max_shares 4.30 2129 417.69069
## 49 self_reference_min_shares 1.80 2140 642.07392
## 50 title_sentiment_polarity 5.75 1740 296.83422
## 51 title_subjectivity 5.80 1410 238.44825
## 52 weekday_is_friday 8.35 451 59.87276
## 53 weekday_is_monday 8.70 435 56.07680
## 54 weekday_is_saturday 5.30 217 72.27175
## 55 weekday_is_sunday 6.95 226 43.68045
## 56 weekday_is_thursday 8.00 413 59.85872
## 57 weekday_is_tuesday 7.40 460 63.60599
## 58 weekday_is_wednesday 7.95 400 60.95398
## no_of_trees times_a_root p_value
## 1 20 0 1.000000e+00
## 2 20 0 1.000000e+00
## 3 20 0 4.498759e-105
## 4 20 0 3.334453e-73
## 5 20 0 1.135587e-104
## 6 20 0 1.000000e+00
## 7 20 0 1.000000e+00
## 8 20 0 1.000000e+00
## 9 20 1 1.000000e+00
## 10 20 0 1.000000e+00
## 11 20 1 1.000000e+00
## 12 20 0 7.026173e-61
## 13 20 0 1.917326e-85
## 14 20 0 1.896071e-81
## 15 20 1 2.196910e-96
## 16 20 2 1.000000e+00
## 17 20 4 1.656148e-192
## 18 20 0 7.696873e-176
## 19 20 0 4.320664e-140
## 20 20 2 4.018252e-179
## 21 20 0 1.000000e+00
## 22 20 0 4.297922e-131
## 23 20 1 9.005029e-10
## 24 20 0 1.286509e-01
## 25 20 0 1.000000e+00
## 26 20 0 2.937625e-155
## 27 20 0 8.074083e-126
## 28 20 2 1.400731e-149
## 29 20 0 1.645481e-118
## 30 20 0 2.086089e-139
## 31 20 0 8.003544e-01
## 32 20 0 1.000000e+00
## 33 20 0 8.973814e-01
## 34 20 1 1.000000e+00
## 35 20 0 1.220013e-120
## 36 20 0 2.855623e-75
## 37 20 0 1.346736e-78
## 38 20 0 2.933635e-07
## 39 20 0 7.614922e-88
## 40 20 0 1.126198e-28
## 41 20 0 1.000000e+00
## 42 20 0 1.000000e+00
## 43 20 0 9.999998e-01
## 44 20 0 1.000000e+00
## 45 20 0 6.678142e-36
## 46 20 0 5.248490e-29
## 47 20 1 1.883822e-29
## 48 20 0 1.869114e-28
## 49 20 4 1.125400e-29
## 50 20 0 3.277781e-02
## 51 20 0 1.000000e+00
## 52 20 0 1.000000e+00
## 53 20 0 1.000000e+00
## 54 20 0 1.000000e+00
## 55 20 0 1.000000e+00
## 56 20 0 1.000000e+00
## 57 20 0 1.000000e+00
## 58 20 0 1.000000e+00
#FOR REGRESSION
# Variable-importance plot for `rf_model`, which is fitted outside this
# section (presumably the regression forest -- confirm).
varImpPlot(rf_model)
# Per-variable importance summary; overwrites the previous `importance_frame`.
# 'mse_increase' is unavailable because the forest was grown without
# localImp = TRUE (see the recorded warning).
importance_frame <- measure_importance(rf_model)
## [1] "Warning: your forest does not contain information on local importance so 'mse_increase' measure cannot be extracted. To add it regrow the forest with the option localImp = TRUE and run this function again."
# Print the summary (the recorded output follows).
importance_frame
## variable mean_min_depth no_of_nodes
## 1 abs_title_sentiment_polarity 6.05 3620
## 2 abs_title_subjectivity 6.75 3634
## 3 average_token_length 4.90 7158
## 4 avg_negative_polarity 5.00 6470
## 5 avg_positive_polarity 4.65 7071
## 6 data_channel_is_bus 9.75 576
## 7 data_channel_is_entertainment 3.75 598
## 8 data_channel_is_lifestyle 8.70 394
## 9 data_channel_is_socmed 4.65 358
## 10 data_channel_is_tech 4.15 553
## 11 data_channel_is_world 7.95 572
## 12 global_rate_negative_words 5.25 6308
## 13 global_rate_positive_words 4.40 6802
## 14 global_sentiment_polarity 4.55 6518
## 15 global_subjectivity 4.75 7115
## 16 is_weekend 2.85 627
## 17 kw_avg_avg 1.10 7724
## 18 kw_avg_max 4.20 7536
## 19 kw_avg_min 4.15 7274
## 20 kw_max_avg 2.15 7668
## 21 kw_max_max 4.85 1481
## 22 kw_max_min 4.55 6995
## 23 kw_min_avg 3.45 4675
## 24 kw_min_max 5.40 4350
## 25 kw_min_min 5.50 1566
## 26 LDA_00 4.85 7170
## 27 LDA_01 4.65 7150
## 28 LDA_02 3.60 6990
## 29 LDA_03 4.50 7120
## 30 LDA_04 3.80 7299
## 31 max_negative_polarity 6.10 4358
## 32 max_positive_polarity 6.45 3445
## 33 min_negative_polarity 6.15 4467
## 34 min_positive_polarity 5.50 3563
## 35 n_non_stop_unique_tokens 4.10 6839
## 36 n_non_stop_words 5.10 5939
## 37 n_tokens_content 4.55 6054
## 38 n_tokens_title 5.85 5192
## 39 n_unique_tokens 4.65 6498
## 40 num_hrefs 4.45 5820
## 41 num_imgs 4.70 3575
## 42 num_keywords 6.10 3421
## 43 num_self_hrefs 6.05 4132
## 44 num_videos 6.00 2363
## 45 rate_negative_words 5.30 5439
## 46 rate_positive_words 5.95 5502
## 47 self_reference_avg_sharess 2.05 5473
## 48 self_reference_max_shares 3.80 5391
## 49 self_reference_min_shares 2.50 5791
## 50 title_sentiment_polarity 6.00 3962
## 51 title_subjectivity 6.30 3942
## 52 weekday_is_friday 8.55 1032
## 53 weekday_is_monday 7.90 1091
## 54 weekday_is_saturday 6.65 371
## 55 weekday_is_sunday 6.20 424
## 56 weekday_is_thursday 8.55 1163
## 57 weekday_is_tuesday 8.40 1181
## 58 weekday_is_wednesday 7.80 1236
## node_purity_increase no_of_trees times_a_root p_value
## 1 361.96855 20 0 1.000000e+00
## 2 329.99511 20 0 1.000000e+00
## 3 942.55218 20 0 0.000000e+00
## 4 761.65110 20 0 4.518616e-206
## 5 865.78038 20 0 0.000000e+00
## 6 37.55146 20 0 1.000000e+00
## 7 227.15143 20 0 1.000000e+00
## 8 55.39074 20 0 1.000000e+00
## 9 146.19255 20 0 1.000000e+00
## 10 154.37855 20 0 1.000000e+00
## 11 77.88976 20 0 1.000000e+00
## 12 670.52556 20 0 4.676845e-178
## 13 829.69178 20 0 4.415091e-269
## 14 837.36505 20 0 1.012688e-214
## 15 949.03939 20 0 0.000000e+00
## 16 290.08876 20 0 1.000000e+00
## 17 2477.40845 20 9 0.000000e+00
## 18 1007.33576 20 0 0.000000e+00
## 19 940.40374 20 0 0.000000e+00
## 20 1639.72546 20 4 0.000000e+00
## 21 225.31631 20 0 1.000000e+00
## 22 827.72836 20 0 4.691301e-309
## 23 797.29637 20 1 7.824304e-08
## 24 495.96671 20 0 3.711963e-01
## 25 181.07964 20 0 1.000000e+00
## 26 942.14681 20 0 0.000000e+00
## 27 908.54537 20 0 0.000000e+00
## 28 933.08244 20 0 5.459439e-308
## 29 997.42790 20 2 0.000000e+00
## 30 1003.62482 20 0 0.000000e+00
## 31 411.41419 20 0 3.259535e-01
## 32 292.27921 20 0 1.000000e+00
## 33 364.56348 20 0 1.734615e-02
## 34 363.67230 20 0 1.000000e+00
## 35 967.56319 20 0 1.474929e-276
## 36 674.19418 20 0 3.142811e-121
## 37 696.98487 20 0 7.092232e-138
## 38 575.80919 20 0 4.935300e-38
## 39 846.62502 20 0 4.252466e-211
## 40 754.50557 20 0 4.808385e-105
## 41 500.14904 20 0 1.000000e+00
## 42 270.21904 20 0 1.000000e+00
## 43 358.97631 20 0 9.988032e-01
## 44 303.51098 20 0 1.000000e+00
## 45 536.07648 20 0 1.359999e-60
## 46 529.55093 20 0 3.777166e-67
## 47 1080.02788 20 3 4.315452e-64
## 48 718.99262 20 0 8.175748e-56
## 49 1089.12378 20 1 2.906891e-101
## 50 480.69846 20 0 1.000000e+00
## 51 397.31150 20 0 1.000000e+00
## 52 85.78724 20 0 1.000000e+00
## 53 105.96413 20 0 1.000000e+00
## 54 77.74983 20 0 1.000000e+00
## 55 75.00159 20 0 1.000000e+00
## 56 87.59704 20 0 1.000000e+00
## 57 79.46224 20 0 1.000000e+00
## 58 97.43598 20 0 1.000000e+00
# CONFUSION TABLE
# Turn the random-forest probabilities from the last bootstrap replication
# into hard 0/1 labels with a 0.5 cut-off; column 2 is the one used below.
yhat <- ifelse(phat3 > 0.5, 1, 0)
## 5. Build the confusion table.
length(test2$nwshares)
## [1] 14671
# Rows are the observed classes, columns the predicted ones. Pinning the
# levels of the predictions keeps both columns in the table even when the
# model predicts a single class, so the table is always two columns wide.
ct <- table(test2$nwshares, factor(yhat[, 2], levels = c(0, 1)))
# Reorder a two-class contingency table so the second level comes first on
# both axes, and label it as a confusion table.
#
# x: a two-way table (or matrix) with at least two rows and two columns;
#    only the top-left 2 x 2 cells are used. The labels applied here
#    ("phat" on rows, "Y" on columns) are only correct when the rows of `x`
#    are the predictions and the columns are the observed classes.
#
# Returns a 2 x 2 numeric matrix with rows "phat=1", "phat=0" and columns
# "Y=1", "Y=0". When `x` is not at least 2 x 2 the result is all zeros and a
# warning is raised instead of failing silently.
gct <- function(x) {
  conf_table <- matrix(
    0, 2, 2,
    dimnames = list(c("phat=1", "phat=0"), c("Y=1", "Y=0"))
  )
  if (length(dim(x)) == 2 && all(dim(x) >= 2)) {
    # Reversing both axes moves cell [2, 2] to [1, 1], [2, 1] to [1, 2], etc.
    conf_table[] <- x[2:1, 2:1]
  } else {
    warning("gct(): input is not at least 2 x 2; returning an all-zero table",
            call. = FALSE)
  }
  conf_table
}
# `ct` is table(observed, predicted), while gct() labels its rows as
# predictions ("phat") and its columns as observed classes ("Y"). Transpose
# first so each count lands under the correct label; without this the two
# off-diagonal cells (false positives / false negatives) are swapped.
# NOTE(review): assumes the first level of `test2$nwshares` is class 0.
cont <- gct(t(ct))
cont
## Y=1 Y=0
## phat=1 6430 2857
## phat=0 2295 3089
# Share of correctly classified rows (the diagonal is unaffected by the
# transpose, so this value is unchanged).
accuracy <- sum(diag(cont)) / sum(cont)
accuracy
## [1] 0.648831