
September 28, 2018
bit.ly/0928pttR

減輕抓取 PTT 資料的負擔
符合 PTT 的斷詞處理
與 R Text Mining 套件銜接
# Demo script: fetch a handful of PTT posts with pttR, segment their text,
# and build corpus objects from the result. Needs network access.

# Install pttR from the "build" ref of its GitHub repository.
devtools::install_github("liao961120/pttR", ref = "build")
library(dplyr)

# Fetch data ----
# index2df() reads a board's index page; post2df() reads individual post
# pages. NOTE(review): `newest = 1` presumably limits the fetch to the single
# most recent index page -- confirm against the pttR documentation.
idx_df <- pttR::index2df("gossiping", newest = 1)

# Take at most the first five post links. head() is used instead of `[1:5]`
# so that an index with fewer than five posts does not yield NA entries that
# would then be handed to as_url() / post2df().
pst_df <- head(idx_df$link, 5) %>%
  pttR::as_url() %>%
  pttR::post2df()

# Word segmentation ----
# Replace the `content` and `comment` columns with their segmented versions.
pst_df_segged <- pst_df %>%
  mutate(
    content = pttR::seg_content(content),
    comment = pttR::seg_comment(comment)
  )

# Comments of the first post
pst_df_segged$comment[[1]]

# Construct corpus objects ----
post_qcorp <- pttR::post2qcorp(pst_df_segged) # Corpus object
cmt_qcorp <- pttR::comment2qcorp(pst_df_segged) # Corpus list-col in df
index2df() \(\equiv\) www.ptt.cc/bbs/看板名稱/index.html
post2df() \(\equiv\) www.ptt.cc/bbs/看板名稱/xx..xx.html
Demo
dplyr::mutate() + pttR::seg_content() + pttR::seg_comment()
(Demo)