Skip to content

Commit 275e40d

Browse files
committed
提交代码
1 parent fbf22ef commit 275e40d

File tree

4 files changed

+184
-0
lines changed

4 files changed

+184
-0
lines changed

.DS_Store

0 Bytes
Binary file not shown.

xianhuan/.DS_Store

6 KB
Binary file not shown.

xianhuan/yanxuanbra/ana.py

Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
1+
#!/usr/bin/env python3
2+
# -*- coding: utf-8 -*-
3+
"""
4+
@author: 闲欢
5+
"""
6+
import json
7+
import pandas as pd
8+
from pyecharts.charts import Bar, Pie
9+
from pyecharts import options as opts
10+
import jieba
11+
from PIL import Image
12+
from wordcloud import WordCloud
13+
from matplotlib import pyplot as plt
14+
import numpy as np
15+
from os import path
16+
17+
# Letter sizes. The first label found inside a SKU string wins, so the
# multi-letter labels are listed before the single letters they contain.
size = ['XXL', 'XL', 'XS', 'S', 'M', 'L']

color = []     # cleaned colour names, one per matching SKU entry
size1 = []     # letter sizes (one of the labels in `size`)
size2 = []     # remaining size strings, i.e. the plain cup-size style
comments = []  # raw comment texts

with open("comments.txt", "r", encoding="utf-8") as f:
    # The file holds one JSON document per line.
    for line in f:
        data_obj = json.loads(line)
        comments.append(data_obj['content'])

        for sku in data_obj['skuInfo']:
            # Colour entries, except the ones that describe underwear.
            if '颜色' in sku and '内裤' not in sku:
                name = sku.replace("颜色:", "").strip()
                # Drop style/product words so only the colour itself is left.
                for noise in ("开扣", "套头", "文胸", "套装", "(薄杯)", "(厚杯)"):
                    name = name.replace(noise, "")
                color.append(name)
                continue

            if '尺码' not in sku:
                continue

            letter = next((label for label in size if label in sku), None)
            if letter is not None:
                size1.append(letter)
            elif '适合' not in sku:
                # Not an S/M/L style size, so it is a plain cup size; entries
                # of the "适合75ABCD" kind are left out.
                size2.append(sku.replace('尺码:', ""))
44+
45+
def _render_count_bar(values, title, output, rotate=0):
    """Render a bar chart of how often each distinct value occurs.

    Args:
        values: List of labels to count.
        title: Chart title.
        output: Path of the HTML file to write.
        rotate: Rotation, in degrees, applied to the x-axis labels.
    """
    counts = pd.Series(values).value_counts()

    chart = Bar()
    chart.add_xaxis(counts.index.tolist())
    chart.add_yaxis("", counts.tolist())
    chart.set_global_opts(
        xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=rotate)),
        title_opts=opts.TitleOpts(title=title),
    )
    chart.render(output)


# Colour distribution (x-axis labels rotated by -90 degrees).
_render_count_bar(color, "颜色分布", 'color.html', rotate=-90)

# Size distribution, letter sizes.
_render_count_bar(size1, "尺寸分布", 'size1.html')

# Size distribution, the remaining (cup-style) sizes.
_render_count_bar(size2, "尺寸分布", 'size2.html')

# Comment visualisation: all comments joined into one text for the word cloud.
text = " ".join(comments)
92+
def gen_wc_split_text(text='There is no txt', max_words=None, background_color=None,
                      font_path='/System/Library/Fonts/PingFang.ttc',
                      output_path='', output_name='',
                      mask_path=None, mask_name=None,
                      width=400, height=200, max_font_size=100, axis='off'):
    """Segment *text* with jieba, draw a word cloud, show it and save it.

    Args:
        text: Raw text to visualise.
        max_words: Passed to ``WordCloud`` unchanged.
        background_color: Passed to ``WordCloud`` unchanged.
        font_path: Font file handed to ``WordCloud``. Without a font that
            covers Chinese the rendered words may come out garbled.
        output_path: Directory the image is written to.
        output_name: File name of the image inside ``output_path``.
        mask_path: Directory of an optional mask image; ``None`` means the
            cloud is drawn without a mask.
        mask_name: File name of the mask inside ``mask_path``; it is joined
            onto ``mask_path``, so it must be given whenever that is.
        width: Passed to ``WordCloud`` unchanged.
        height: Passed to ``WordCloud`` unchanged.
        max_font_size: Passed to ``WordCloud`` unchanged.
        axis: Argument for ``plt.axis``; the default ``'off'`` hides the axes.

    Returns:
        None. The image is shown with matplotlib and then written to
        ``path.join(output_path, output_name)``.
    """
    # Join the segments in a single pass instead of growing a string with
    # ``+`` inside a loop.
    split_text = ' '.join(jieba.cut(text, cut_all=False))

    # Optional mask image for the cloud.
    mask = None
    if mask_path is not None:
        mask = np.array(Image.open(path.join(mask_path, mask_name)))

    wordcloud = WordCloud(background_color=background_color,
                          mask=mask,
                          max_words=max_words,
                          max_font_size=max_font_size,
                          width=width,
                          height=height,
                          font_path=font_path)
    myword = wordcloud.generate(split_text)

    # Show the word cloud.
    plt.imshow(myword)
    plt.axis(axis)
    plt.show()

    # Save the word cloud.
    wordcloud.to_file(path.join(output_path, output_name))
124+
# Draw the word cloud for the joined comments and save it as ./comments_wc.png.
gen_wc_split_text(text=text, output_path='./', output_name='comments_wc.png')

xianhuan/yanxuanbra/bra.py

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
#!/usr/bin/env python3
2+
# -*- coding: utf-8 -*-
3+
"""
4+
@author: 闲欢
5+
"""
6+
import requests
7+
import time
8+
import json
9+
10+
def search_keyword(keyword, timeout=10):
    """Return the product ids found on the first search page for *keyword*.

    Sends ``keyword`` and ``page=1`` to the search endpoint and collects the
    ``id`` of every entry under ``data.directly.searcherResult.result``.

    Args:
        keyword: Search term, sent as the ``keyword`` query parameter.
        timeout: Seconds ``requests`` waits for the server. Without a
            timeout a stalled connection would block the script forever.

    Returns:
        list: The ids, in the order the response lists them.

    Raises:
        requests.RequestException: The request failed or timed out.
        KeyError: The response does not have the expected structure.
    """
    uri = 'https://you.163.com/xhr/search/search.json'
    query = {
        "keyword": keyword,
        "page": 1,
    }
    # Errors propagate to the caller unchanged.
    res = requests.get(uri, params=query, timeout=timeout).json()
    result = res['data']['directly']['searcherResult']['result']
    return [r['id'] for r in result]
26+
27+
def details(product_id, max_pages=99, timeout=10):
    """Collect the comments of one product, page by page.

    Requests pages 1, 2, ... of the comment endpoint and stops at the first
    page whose ``data.commentList`` is empty, or after ``max_pages`` pages.
    Sleeps one second after every non-empty page.

    Args:
        product_id: Sent as the ``itemId`` query parameter.
        max_pages: Upper bound on the number of pages requested.
        timeout: Seconds ``requests`` waits for the server on each request.
            Without a timeout a stalled connection would block forever.

    Returns:
        list: All comment objects from the fetched pages, in page order.

    Raises:
        requests.RequestException: A request failed or timed out.
        KeyError: A response does not have the expected structure.
    """
    url = 'https://you.163.com/xhr/comment/listByItemByTag.json'
    collected = []
    for page in range(1, max_pages + 1):
        query = {
            "itemId": product_id,
            "page": page,
        }
        res = requests.get(url, params=query, timeout=timeout).json()
        comment_list = res['data']['commentList']
        if not comment_list:
            # An empty page means there is nothing left to fetch.
            break
        print("爬取第 %s 页评论" % page)
        collected.extend(comment_list)
        time.sleep(1)

    return collected
48+
49+
50+
# Fetch the comments of every product returned by the search.
product_id = search_keyword('文胸')
r_list = []
for p in product_id:
    r_list.extend(details(p))

# One JSON document per line. The encoding is set explicitly because the
# records are written with ensure_ascii=False (raw non-ASCII text) and the
# analysis script reads comments.txt back as UTF-8; the platform default
# encoding is not guaranteed to be UTF-8.
with open('./comments.txt', 'w', encoding='utf-8') as f:
    for r in r_list:
        try:
            f.write(json.dumps(r, ensure_ascii=False) + '\n')
        except (TypeError, ValueError):
            # Best effort: skip a record that cannot be serialised or encoded
            # (UnicodeEncodeError is a ValueError) and keep going. Anything
            # else, such as an I/O failure, is allowed to propagate.
            print('出错啦')

0 commit comments

Comments
 (0)