1 Prepare Data

Key Takeaways:

  1. File reut2-017.sgm is not UTF-8 encoded and needs to be re-saved as UTF-8 for life to be easy.
  2. The legacy .sgm format, which is essentially poorly-formed XML, picked fights with every parser it encountered, even Beautiful Soup.
  3. Reading the files and parsing the tags manually turned out to be the most efficient solution.
  4. Two of the 21,578 date fields required manual re-formatting: 5-APR-1987 01:53:30 and 31-MAR-1987 605:12:1.

These 21,578 documents are readily available as an R document corpus in the package tm.corpus.Reuters21578. However, that corpus comes with many headings missing and most dates missing, so we read the files line-by-line ourselves to extract the date, the text (with heading), and the value of lewissplit in case we wanted to perform some predictive modeling. This ended up being really annoying because of the non-UTF-8 encoding of file 17, but once that was fixed, we got what we needed.

Beautiful Soup was no help here.
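
For reference, the re-encoding can be done in R itself; the lines below are a minimal sketch (kept commented out like the other one-off steps) that assumes the offending bytes in reut2-017.sgm are latin1 and that the file sits in the working directory:

# # Re-save reut2-017.sgm as UTF-8 (assumption: the original bytes are latin1)
# fixed = iconv(readLines('reut2-017.sgm', encoding = 'latin1'), from = 'latin1', to = 'UTF-8')
# writeLines(fixed, 'reut2-017.sgm', useBytes = TRUE)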

# install.packages("tm.corpus.Reuters21578", repos = "http://datacube.wu.ac.at")
# install.packages("SnowballC")
# install.packages("textreg")
# install.packages('tm.corpus.Reuters21578')
# install.packages('text2vec')
# install.packages('Matrix')
# install.packages('umap')
# install.packages('tm')
# install.packages('slam')
# install.packages('irlba')
# install.packages('dbscan')
# install.packages('plotly')
# install.packages('gridExtra')
# install.packages('lubridate')
# install.packages('maxmatching')
# install.packages('plyr')
# install.packages('rARPACK')
# install.packages('textrank')
# install.packages('rvest')
# install.packages('tidytext')
# install.packages('tsne')

# ##################################################################
library(textrank)
library(rvest)
library(tidytext)
library(dbscan)
library(ggplot2)
library(irlba)
library(uwot)
library(slam)
library(text2vec)
#library(tm.corpus.Reuters21578)
library(SnowballC)
library(tm)
library(textreg)
library(stringr)
library(Matrix)
library(plotly)
library(gridExtra)
library(lubridate)
library(rARPACK)
library(htmlwidgets)
library(bookdown)
library(fs)
library(tsne)
#setwd('/Users/shaina/Library/Mobile Documents/com~apple~Clouddocs/final_data_plots/Datasets and Code/reuters21578/')
# 
# PATH = '/Users/shaina/Library/Mobile Documents/com~apple~Clouddocs/Datasets and Code/reuters21578/Files/'
# filenames=dir_ls(PATH)
# datetime=vector()
# text = vector()
# lewissplit=vector()
# 
# 
# for(file in filenames){
#   con = file(file,"r", encoding = "UTF-8")
#   line = readLines(con, encoding="UTF-8")
#   line = paste(line, sep=' ', collapse = ' ')
#   # Fixed-width fields: grab the 20 characters after <DATE> and the 4 characters after LEWISSPLIT="
#   lewis_idx=str_locate_all(pattern='LEWISSPLIT=',line)
#   date_idx=str_locate_all(pattern='<DATE>',line)
#   for(i in 1:nrow(lewis_idx[[1]])){
#     datetime[length(datetime)+1]=substr(line, date_idx[[1]][i,2]+1,date_idx[[1]][i,2]+20)
#     lewissplit[length(lewissplit)+1]=substr(line, lewis_idx[[1]][i,2]+2,lewis_idx[[1]][i,2]+5)
#   }
#   # Variable-width field: after rewriting </ as <, each document's TEXT sits between consecutive <TEXT markers
#   line = paste(line, sep=' ', collapse = ' ')
#   line=gsub('</','<',line)
#   text_idx=str_locate_all(pattern='<TEXT',line)
#   for(i in seq(1,nrow(text_idx[[1]])-1,2)){
#     text[length(text)+1]=substr(line, text_idx[[1]][i,2]+2,text_idx[[1]][i+1,1]-1)
#   }
#   close(con)
# }
# 
# # Initially I ended up with fewer documents than I should; counting occurrences
# # of <TEXT in each file traced the problem to file 17. Fixing its UTF-8 encoding
# # resolved the discrepancy.
# date = as.POSIXct(datetime,format = '%d-%b-%Y %H:%M:%S')
# datetime[which(is.na(date))]
# datetime[which(is.na(date))] = c("5-APR-1987 01:53:30", "31-MAR-1987 05:12:1")
# datetime = as.POSIXct(datetime,format = '%d-%b-%Y %H:%M:%S')
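# # LEWISSPLIT was captured as a 4-character prefix above; recode to readable labels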
# lewissplit[lewissplit=='TRAI']='train'
# lewissplit[lewissplit=='TEST']='test'
# #
# # ############################################################################
# # # Now I can pull out the heading...
# # ############################################################################
# title_idx = str_locate_all(pattern='<TITLE>',text)
# head=vector()
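# # Fewer than two <TITLE> markers (after the </ rewrite) means the document has no title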
# for(i in 1:(length(text))){
#   if(nrow(title_idx[[i]])<2){
#     head[i]=''
#   }else{
#       head[i]=substr(text[i], title_idx[[i]][1,2]+1,title_idx[[i]][2,1]-1)
#     }
# }
# head=gsub('&lt;','<',head, fixed=T)
# # ############################################################################
# # # ...and the raw article text.
# # ############################################################################
# body_idx = str_locate_all(pattern='<BODY>',text)
# body=vector()
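# # Documents with no <BODY> section (e.g., briefs) keep the whole TEXT block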
# for(i in 1:(length(text))){
#   if(nrow(body_idx[[i]])<2){
#     body[i]=text[i]
#   }else{
#       body[i]=substr(text[i], body_idx[[i]][1,2]+1,body_idx[[i]][2,1]-1)
#     }
# }
# # Clean up Briefs
# body = gsub("TYPE=\"BRIEF\">&#2; ******<TITLE>",'',body, fixed=T)
# body = gsub("<TITLE>Blah blah blah. &#3;", '',body, fixed=T)
# 
# 
#  text=body
#  Reuters <- Corpus(VectorSource(text))
# save(text,head,Reuters,lewissplit,datetime,file='docs/final_data_plots/RawDataRead.RData')
 load('docs/final_data_plots/RawDataRead.RData')

With the raw data loaded, the text is preprocessed as follows:

  1. Make lower case; remove stop words, "Reuters" and other corpus-specific high-frequency terms, punctuation, and numbers; apply stemming.
  2. Create a binary term-document matrix and drop terms occurring in fewer than 5 documents.
  3. Remove documents left with 10 or fewer words.
  4. Subset the datetime, lewissplit, heading, and raw text vectors accordingly.
# load('Reuters.RData')

# # 1
# ############################################################
# R = Reuters
# R = tm_map(R,content_transformer(tolower))
# R = tm_map(R,removeWords,stopwords("en"))
# R = tm_map(R,removePunctuation)
# R = tm_map(R,removeNumbers)
# R = tm_map(R,stemDocument)
# R = tm_map(R,removeWords, c('reuter', 'dlrs', 'mln', 'said','will', 'year', 'compani','pct','corp' ))
# # ############################################################
# # 2
# # ############################################################
# tdm = TermDocumentMatrix(R)
# binary = weightBin(tdm)
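# # row_sums of the binary matrix give the number of documents containing each term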
# keep_terms = row_sums(binary)>=5
# tdm = tdm[keep_terms,]
# # ############################################################
# # 3
# # ############################################################
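# # col_sums gives each document's remaining word count after the term filter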
# keep_docs = col_sums(tdm)>10
# R = R[keep_docs]
# tdm = tdm[,keep_docs ]
# dim(tdm)
# length(R)
# # ############################################################
# # 4
# # ############################################################
# datetime = datetime[keep_docs]
# lewissplit=lewissplit[keep_docs]
# head=head[keep_docs]
# raw_text=text[keep_docs]
# #############################################################
# # add breaks for text wrapping
# #############################################################
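# # Insert <br> at the first whitespace after every run of 60+ characters so long articles wrap when displayed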
# raw_text = gsub("(.{60,}?)\\s", "\\1<br>", raw_text)
# # ############################################################
# # Save data to avoid repeat processing
# # ############################################################
# save(raw_text,head,lewissplit,tdm,R,datetime, file='docs/final_data_plots/processedV2.RData')
load('docs/final_data_plots/processedV2.RData')