
Commit ffed287

committed
Submit code
1 parent 275e40d commit ffed287

File tree

3 files changed: +158 −0 lines changed


xianhuan/.DS_Store

0 Bytes
Binary file not shown.
Lines changed: 98 additions & 0 deletions
@@ -0,0 +1,98 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
@author: 闲欢
"""
import json
import pandas as pd
from pyecharts.charts import Bar, Pie
from pyecharts import options as opts
import jieba
from PIL import Image
from wordcloud import WordCloud
from matplotlib import pyplot as plt
import numpy as np
from os import path

color = []
size = []
comments = []

# Read the scraped comments (one JSON object per line) and collect
# colors, sizes, and review texts.
with open("briefs.txt", "r", encoding="utf-8") as f:
    for line in f:
        data_obj = json.loads(line)
        comments.append(data_obj['content'])
        skuinfo = data_obj['skuInfo']
        for sku in skuinfo:
            if '颜色' in sku and '规格' not in sku:
                # Strip the "颜色:" prefix and pack-size noise such as "3条", "四条装", "*2"
                filter_sku = sku.replace("颜色:", "").strip().replace("(", "").replace(")3条", "").replace("四条装", "").replace("*2", "").replace("2条", "").replace(")", "")
                color.extend(filter_sku.split('+'))
            elif '尺码' in sku and '~' not in sku:
                size.append(sku.replace('尺码:', ""))

# Color distribution bar chart
df = pd.DataFrame(color, columns=['color'])
analyse_color = df['color'].value_counts()

bar = Bar()
bar.add_xaxis(analyse_color.index.values.tolist())
bar.add_yaxis("", analyse_color.values.tolist())
bar.set_global_opts(
    xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=-90)),
    title_opts=opts.TitleOpts(title="颜色分布"),
    # datazoom_opts=opts.DataZoomOpts(),
)
# bar.render_notebook()
bar.render('briefs_color.html')


# Size distribution bar chart
df2 = pd.DataFrame(size, columns=['size'])
analyse_size = df2['size'].value_counts()

bar = Bar()
bar.add_xaxis(analyse_size.index.values.tolist())
bar.add_yaxis("", analyse_size.values.tolist())
bar.set_global_opts(
    xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=0)),
    title_opts=opts.TitleOpts(title="尺寸分布"),
    # datazoom_opts=opts.DataZoomOpts(),
)
bar.render('briefs_size.html')


# Word cloud of the review texts
text = " ".join(comments)


def gen_wc_split_text(text='There is no txt', max_words=None, background_color=None,
                      font_path='/System/Library/Fonts/PingFang.ttc',
                      output_path='', output_name='',
                      mask_path=None, mask_name=None,
                      width=400, height=200, max_font_size=100, axis='off'):
    # Segment the Chinese text with jieba so WordCloud can count individual words
    all_seg = jieba.cut(text, cut_all=False)
    split_text = ' '.join(all_seg)

    # Optional mask image that shapes the word cloud
    mask = None
    if mask_path is not None:
        mask = np.array(Image.open(path.join(mask_path, mask_name)))

    wordcloud = WordCloud(background_color=background_color,
                          mask=mask,
                          max_words=max_words,
                          max_font_size=max_font_size,
                          width=width,
                          height=height,
                          # Without a Chinese font the words may render as garbled boxes
                          font_path=font_path)
    myword = wordcloud.generate(str(split_text))
    # Display the word cloud
    plt.imshow(myword)
    plt.axis(axis)
    plt.show()

    # Save the word cloud image
    wordcloud.to_file(path.join(output_path, output_name))


gen_wc_split_text(text, output_name='briefs_comments_wc.png', output_path='./')
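
For reference, the analysis script above reads only two fields from each line of briefs.txt: content and skuInfo. Below is a minimal sketch of one input record with hypothetical values; the real records returned by the comment API carry additional keys beyond these two.

# Hypothetical, trimmed-down record for illustration only; actual records
# written by the scraper below contain more fields than the two read above.
import json

sample = {
    "content": "质量不错,穿着很舒服",           # free-text review, fed into the word cloud
    "skuInfo": ["颜色:黑色+灰色", "尺码:XL"]    # parsed into the color and size charts
}
print(json.dumps(sample, ensure_ascii=False))  # briefs.txt holds one such JSON line per record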
Lines changed: 60 additions & 0 deletions
@@ -0,0 +1,60 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
@author: 闲欢
"""
import requests
import time
import json


# Search you.163.com for a keyword and return the matching product ids
def search_keyword(keyword):
    uri = 'https://you.163.com/xhr/search/search.json'
    query = {
        "keyword": keyword,
        "page": 1
    }
    try:
        res = requests.get(uri, params=query).json()
        result = res['data']['directly']['searcherResult']['result']
        product_id = []
        for r in result:
            product_id.append(r['id'])
        return product_id
    except:
        raise


# Fetch the comment list for one product, page by page
def details(product_id):
    url = 'https://you.163.com/xhr/comment/listByItemByTag.json'
    try:
        C_list = []
        for i in range(1, 100):
            query = {
                "itemId": product_id,
                "page": i,
            }
            res = requests.get(url, params=query).json()
            # Stop once a page comes back empty
            if not res['data']['commentList']:
                break
            print("Scraping comment page %s" % i)
            commentList = res['data']['commentList']
            C_list.extend(commentList)
            time.sleep(1)  # be polite: one request per second

        return C_list
    except:
        raise


product_id = search_keyword('男士内裤')
r_list = []
for p in product_id:
    r_list.extend(details(p))

# Write one comment object per line as JSON, for the analysis script to read
with open('./briefs.txt', 'w', encoding='utf-8') as f:
    for r in r_list:
        try:
            f.write(json.dumps(r, ensure_ascii=False) + '\n')
        except:
            print('Failed to write a record')
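
Taken together, the two files form a small pipeline: this scraper writes briefs.txt, and the visualization script in the first file reads it. If the you.163.com endpoints ever reject bare requests, a browser-style User-Agent and a timeout can be passed to requests.get; the snippet below is an assumed hardening step, not part of this commit.

# Assumed hardening, not in the committed code; the header value and timeout are illustrative.
HEADERS = {"User-Agent": "Mozilla/5.0"}
res = requests.get(url, params=query, headers=HEADERS, timeout=10).json()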
