《战狼Ⅱ》票房破50亿了,你还不知道它在说啥?本文先用Python爬虫抓取12万条影评,再用R进行分词和词云分析,告诉你《战狼Ⅱ》用什么撩到了你。
import re

import pandas as pd
import requests

# Scrape the Douban short comments for subject 26363254 page by page and
# append every page's extracted fields to a CSV file.

url_first = 'https://movie.douban.com/subject/26363254/comments?start=0'
head = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko)Ubuntu Chromium/59.0.3071.109 Chrome/59.0.3071.109 Safari/537.36'}
# Must be assigned before the first request, which passes it to requests.get.
# Replace the placeholder with the cookie of your own logged-in account.
cookies = {'cookie': '你自己的cookie'}

# Pattern for the relative link to the next page.
# NOTE(review): this pattern was missing from the listing (the loop used `reg`
# for the next-page link while `reg` actually held the comment pattern below).
# It is a reconstruction -- confirm it against the live page markup.
reg = re.compile(r'<a href="(.*?)&amp;.*?class="next">')

# Pattern for the comment fields (votes, user, rating, text, ...).
# NOTE(review): the HTML tag literals of this pattern appear to have been
# stripped when the listing was published; restore them against the live page
# markup before relying on the extracted columns.
ren = re.compile(r'.*?.*?(.*?).*?(.*?).*?title="(.*?)">.*?title="(.*?)">.*?class=""> (.*?)\n', re.S)

html = requests.get(url_first, headers=head, cookies=cookies)
while html.status_code == 200:
    # Save the current page first so the last page is not lost when no
    # further "next" link exists.
    zhanlang = re.findall(ren, html.text)
    data = pd.DataFrame(zhanlang)
    # mode='a+' appends, so repeated runs keep adding rows to the same file.
    data.to_csv('/home/wajuejiprince/文档/zhanlang/zhanlangpinglun.csv',
                header=False, index=False, mode='a+')

    next_links = re.findall(reg, html.text)
    if not next_links:
        # No next-page link: this was the last page.
        break
    url_next = 'https://movie.douban.com/subject/26363254/comments' + next_links[0]
    html = requests.get(url_next, cookies=cookies, headers=head)
library(data.table)
library(plotly)
library(stringr)
library(jiebaR)
library(wordcloud2)
library(magrittr)
# Load the scraped reviews from a file chosen interactively.
dt <- fread(file.choose())

# Drop columns V8..V13 in place (the original author marks them as empty).
empty_cols <- paste0("V", 8:13)
dt[, (empty_cols) := NULL]

# Clean the data in one pass: keep rows whose vote count contains digits,
# whose "useful" marker and "watched" marker carry the expected labels, and
# whose star rating is one of the five known levels.
my_dt <- dt[
  str_detect(赞成评论数, "\\d+") &
    评论有用 == "有用" &
    是否看过 == "看过" &
    五星数 %in% c("很差", "较差", "还行", "推荐", "力荐")
]
# Create a jiebaR segmentation engine with its default settings.
wk <- worker()

# Segment the text `x` into words using the shared engine `wk`.
sw <- function(x) {
  segment(x, wk)
}

# Segment every review, then flatten the per-review results into a single
# character vector (no list structure is needed downstream).
segwords <- lapply(my_dt[["评论内容"]], sw)
my_segwords <- unlist(segwords)
# Remove stop words.
# Read the stop-word list from a file chosen interactively
# (presumably one word per line -- verify against the actual file).
st <- readLines(file.choose())

# readLines() already returns a character vector, so it can be used directly.
# The previous `for (i in 1:length(st))` copy loop iterated over c(1, 0) for
# an empty file and stored NA as a stop word.
stopwords <- st

# Filter the Chinese stop words out of the segmented tokens; with an empty
# stop-word list there is nothing to remove, so the tokens pass through.
seg_Words <- if (length(stopwords) > 0) {
  filter_segment(my_segwords, stopwords)
} else {
  my_segwords
}
# Count how often each token occurs and rename the count column from the
# default `N` to `pinshu` (frequency).
words <- table(seg_Words) %>% data.table()
setnames(words, "N", "pinshu")

# Keep only tokens that occur more than `min_pinshu` times (tokens with a
# count of exactly 1000 are dropped too). The subset is computed once and
# reused; previously it was computed, discarded, and recomputed.
min_pinshu <- 1000
frequent_words <- words[pinshu > min_pinshu]
frequent_words # auto-prints the retained vocabulary at top level

# Draw the word cloud from the frequent tokens.
wordcloud2(
  frequent_words,
  size = 2,
  fontFamily = "微软雅黑",
  color = "random-light",
  backgroundColor = "grey"
)