Skip to content

Commit 60c2db2

Browse files
committed
python showdata
1 parent 9e639b4 commit 60c2db2

25 files changed

+99952
-0
lines changed
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
#!/usr/bin/env python3
2+
# -*- coding: utf-8 -*-
3+
# @Time : 2020/8/27 14:06
4+
# @Author : way
5+
# @Site :
6+
# @Describe: 通过 ip 获取所在省份
7+
8+
import sys
9+
import json
10+
import requests
11+
import os
12+
13+
ak = "<换成你的ak>"  # Baidu API key (AK); apply for your own at http://lbsyun.baidu.com/index.php?title=webapi/ip-api


def load_ip_cache(path="ip_cache.txt"):
    """Load the on-disk IP -> province cache.

    Every useful line of the file has the form ``<ip>\\t<province>``.
    Blank lines, lines without exactly one tab, and lines whose IP or
    province is empty are skipped instead of raising, so a single damaged
    line cannot prevent this module from being imported.

    Args:
        path: Location of the cache file.

    Returns:
        A dict mapping IP strings to province strings. Empty when the
        file does not exist.
    """
    cache = {}
    if not os.path.exists(path):
        return cache
    # Opened with the platform default encoding on purpose: the code in this
    # file that appends to the cache also uses the default encoding.
    with open(path, "r") as f:
        for raw in f:
            fields = raw.rstrip("\r\n").split("\t")
            if len(fields) != 2:
                continue
            ip, province = fields[0].strip(), fields[1].strip()
            if ip and province:
                cache[ip] = province
    return cache


# In-memory cache shared by the lookups in this module.
ipCache = load_ip_cache()
23+
24+
def ip2province(ip):
    """Resolve an IP address to a province name via the Baidu IP-location API.

    Successful lookups are stored in the module-level ``ipCache`` dict and
    appended to ``ip_cache.txt`` so later runs can skip the network call.

    Args:
        ip: IP address as a string.

    Returns:
        The province string, or "未知" when the IP is empty, the request
        fails, or the response carries no usable province.
    """
    # NOTE(review): Baidu's API reportedly locates the *caller* when the ip
    # parameter is empty - confirm against the API docs. Either way an empty
    # key would write a line that starts with a tab into the cache file.
    if not ip:
        return "未知"

    province = ipCache.get(ip)
    if province is not None:
        return province

    try:
        # params= lets requests escape the log-derived IP; timeout keeps one
        # stalled connection from blocking the whole run.
        response = requests.get(
            "https://api.map.baidu.com/location/ip",
            params={"ak": ak, "ip": ip, "coor": "bd09ll"},
            timeout=5,
        )
        # The address field is pipe-separated; the second part is the province.
        province = json.loads(response.text)['address'].split('|')[1]
    except Exception:
        # Best-effort lookup: any network, JSON or shape problem means unknown.
        return "未知"

    if not province:
        # An empty province is not worth caching and would produce a
        # malformed "<ip>\t" line in the cache file.
        return "未知"

    ipCache[ip] = province
    try:
        with open("ip_cache.txt", "a") as f:
            f.write(ip + "\t" + province + "\n")
    except (OSError, UnicodeError) as e:
        # Persisting is only an optimisation; keep the resolved value.
        print("[ip2province] could not update ip_cache.txt:", e, file=sys.stderr)
    return province
39+
40+
if __name__ == '__main__':
    # Filter mode: read tab-separated rows from stdin, take the first column
    # as the IP and emit "<province>\t<ip>" rows on stdout.
    for line in sys.stdin:
        cols = line.replace('\n', '').split('\t')
        # Debug trace goes to stderr so it cannot mix into the data stream.
        print(cols, file=sys.stderr)
        cols = [ip2province(cols[0]), cols[0]]
        sys.stdout.write('\t'.join(cols) + '\n')
Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
import re
2+
import os
3+
import datetime
4+
from baidu_api import ip2province
5+
import pandas as pd
6+
import openpyxl
7+
from openpyxl import load_workbook
8+
9+
# 命名分组
10+
# Pre-compiled pattern for one access-log line, using named groups:
# ip, time, request, status, bytes, referer and ua.
# The literal "- - " after the ip group means only lines whose two fields
# following the address are both "-" can match; the ip group may therefore
# carry trailing whitespace and is stripped by the caller.
obj = re.compile(r'(?P<ip>.*?)- - \[(?P<time>.*?)\] "(?P<request>.*?)" (?P<status>.*?) (?P<bytes>.*?) "(?P<referer>.*?)" "(?P<ua>.*?)"')
11+
def load_log(path):
    """Read a log file and split its lines into parsed records and rejects.

    Every line is stripped and handed to ``parse``. A truthy result is kept
    as a record; anything falsy sends the stripped line to the reject list.
    A progress counter is printed every 1000 lines.

    Args:
        path: Path of the UTF-8 encoded log file.

    Returns:
        A ``(records, rejected_lines)`` tuple of two lists.
    """
    records, rejected_lines = [], []
    with open(path, mode="r", encoding="utf-8") as handle:
        for count, raw in enumerate(handle, start=1):
            text = raw.strip()
            record = parse(text)
            if not record:
                # Dirty data: keep the raw text for later inspection.
                rejected_lines.append(text)
            else:
                records.append(record)
            if count % 1000 == 0:
                print(count, "行")
    return records, rejected_lines
27+
28+
# (substring looked for in the User-Agent, label stored in the record);
# checked in order, first hit wins.
_UA_DEVICES = (
    ("Windows NT", "windows"),
    ("iPad", "ipad"),
    ("Android", "android"),
    ("Macintosh", "mac"),
    ("iPhone", "iphone"),
)


def parse(line):
    """Parse a single nginx access-log line into a flat record.

    Args:
        line: One log line (already stripped by the caller).

    Returns:
        * a dict with the keys ``ip``, ``province``, ``status``, ``time``,
          ``hour``, ``request``, ``ua`` and ``referer`` on success;
        * ``False`` when the line carries no client IP;
        * ``None`` when the line does not match the log pattern or one of
          its fields cannot be parsed (the problem is printed).
    """
    result = obj.match(line)
    if result is None:
        print("[parse]", line, "-->", "line does not match the log pattern")
        return None

    # Lines without a client IP are discarded.
    raw_ip = result.group("ip").strip()
    if raw_ip == '-' or raw_ip == "":
        return False
    # When several addresses are listed, keep the first one.
    ip = raw_ip.split(",")[0].strip()
    if not ip:
        return False

    try:
        # e.g. "21/Dec/2019:21:45:31 +0800": keep the logged wall-clock time
        # and drop the UTC offset, whatever its value.
        timestamp = result.group("time").split()[0]
        t = datetime.datetime.strptime(timestamp, "%d/%b/%Y:%H:%M:%S")
        # "<METHOD> <url>?<query> <protocol>": keep the URL without its query.
        url = result.group("request").split()[1].split("?")[0]
    except (IndexError, ValueError) as e:
        print("[parse]", line, "-->", e)
        return None

    ua = result.group("ua")
    device = next(
        (label for marker, label in _UA_DEVICES if marker in ua),
        "其他设备",
    )

    # The province lookup may hit the network, so it only runs once the rest
    # of the line is known to be valid.
    return {
        'ip': ip,
        'province': ip2province(ip),
        'status': result.group("status"),
        'time': t,
        'hour': t.hour,
        'request': url,
        'ua': device,
        'referer': result.group("referer"),
    }
81+
82+
def analyse(lst, datafile):
    """Aggregate parsed log records and store the summaries in an Excel file.

    Three frequency tables are produced: requests per province, per hour of
    day (sorted by hour) and per client type. Each one is written to its
    own sheet of ``datafile`` through ``to_excel``.

    Args:
        lst: List of record dicts; each needs the keys ``province``,
            ``hour`` and ``ua``.
        datafile: Path of the Excel workbook to write.
    """
    df = pd.DataFrame(lst)

    def count_by(column):
        # Build a two-column frame [column, "count"] with explicit names, so
        # the result does not depend on how a given pandas version labels the
        # output of value_counts().reset_index().
        counts = df[column].value_counts()
        return counts.rename_axis(column).reset_index(name="count")

    # Requests per province
    province_count_df = count_by('province')

    # Requests per hour of day
    hour_count_df = count_by('hour').sort_values(by='hour')

    # Requests per client type
    ua_count_df = count_by('ua')

    # Persist the three tables
    to_excel(province_count_df, datafile, sheet_name='省份')
    to_excel(hour_count_df, datafile, sheet_name='按时')
    to_excel(ua_count_df, datafile, sheet_name='客户端')
98+
99+
def to_excel(dataframe, filepath, sheet_name):
    """Write a DataFrame to one sheet of an Excel workbook.

    The data is written without index and without header row. When the
    workbook already exists, the sheet is added to it and the other sheets
    are kept; a sheet that already has this name is replaced, so repeated
    runs do not pile up duplicates. Otherwise a new workbook is created.

    Args:
        dataframe: The pandas DataFrame to store.
        filepath: Path of the ``.xlsx`` file.
        sheet_name: Name of the target sheet.
    """
    if os.path.exists(filepath):
        # Append mode keeps the existing sheets; the context manager saves
        # and closes the workbook even if writing fails.
        with pd.ExcelWriter(
            filepath,
            engine='openpyxl',
            mode='a',
            if_sheet_exists='replace',
        ) as writer:
            dataframe.to_excel(writer, sheet_name=sheet_name, index=False, header=False)
    else:
        dataframe.to_excel(filepath, sheet_name=sheet_name, index=False, header=False)
108+
109+
if __name__ == '__main__':
    # Parse the access log, then write the summary sheets; the rejected
    # lines returned by load_log are not used here.
    records, _rejected = load_log("nginx_access.log")
    analyse(records, "data.xlsx")

taiyangxue/showdata/ip_cache.txt

Lines changed: 186 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,186 @@
1+
124.64.19.27 北京
2+
124.64.18.118 北京
3+
114.246.34.133 北京
4+
123.119.247.136 北京
5+
221.219.132.144 北京
6+
116.2.39.176 辽宁
7+
36.104.125.224 吉林
8+
124.64.16.10 北京
9+
124.64.19.17 北京
10+
124.64.19.198 北京
11+
61.181.218.54 天津
12+
124.64.17.231 北京
13+
218.69.54.122 天津
14+
124.64.17.228 北京
15+
221.192.179.8 河北
16+
211.94.246.253 天津
17+
211.94.239.98 天津
18+
114.242.249.80 北京
19+
114.242.250.129 北京
20+
101.91.60.81 江苏
21+
221.192.179.30 河北
22+
43.249.136.26 天津
23+
124.64.16.188 北京
24+
114.242.248.31 北京
25+
124.64.18.198 北京
26+
218.68.91.101 辽宁
27+
61.148.245.90 北京
28+
117.10.206.88 天津
29+
203.208.60.64 福建
30+
203.208.60.3 福建
31+
203.208.60.60 福建
32+
124.64.19.57 北京
33+
123.151.77.91 河北
34+
124.64.18.144 北京
35+
123.151.76.158 天津
36+
61.148.243.176 北京
37+
124.64.17.67 北京
38+
223.104.3.204 北京
39+
124.64.16.93 北京
40+
211.94.208.121 天津
41+
211.94.246.15 天津
42+
124.64.16.138 北京
43+
61.148.243.38 北京
44+
120.244.52.54 北京
45+
124.64.16.136 北京
46+
124.64.18.217 北京
47+
120.245.4.41 北京
48+
211.94.195.158 天津
49+
124.64.16.135 北京
50+
123.151.76.248 天津
51+
61.148.244.12 北京
52+
124.64.16.253 北京
53+
218.68.91.112 辽宁
54+
124.64.18.76 北京
55+
211.94.238.171 天津
56+
125.39.46.56 辽宁
57+
111.196.106.59 北京
58+
114.241.45.159 北京
59+
211.94.225.8 天津
60+
123.150.174.182 天津
61+
61.181.236.214 天津
62+
124.64.16.98 北京
63+
61.148.243.25 北京
64+
124.64.17.42 北京
65+
123.151.77.81 河北
66+
211.94.230.118 天津
67+
103.3.96.2 天津
68+
114.242.250.155 北京
69+
114.240.67.63 北京
70+
124.64.16.221 北京
71+
211.94.252.135 天津
72+
124.64.16.62 北京
73+
218.69.61.133 天津
74+
124.64.19.236 北京
75+
114.242.248.188 北京
76+
117.10.207.38 天津
77+
124.64.16.19 北京
78+
211.94.251.187 天津
79+
139.214.251.167 吉林
80+
123.151.77.70 河北
81+
111.30.142.186 河北
82+
111.30.142.227 河北
83+
223.104.236.216 辽宁
84+
111.30.142.78 河北
85+
223.104.3.186 北京
86+
180.97.118.219 江苏
87+
124.64.18.192 北京
88+
122.115.226.173 北京
89+
220.194.106.92 天津
90+
220.194.106.94 北京
91+
202.99.89.162 天津
92+
124.64.19.220 北京
93+
124.64.18.4 北京
94+
61.148.245.99 北京
95+
223.104.3.198 北京
96+
211.94.239.184 天津
97+
125.39.132.94 北京
98+
211.94.225.128 天津
99+
61.151.207.158 上海
100+
117.136.38.145 北京
101+
223.104.227.204 天津
102+
103.3.96.18 天津
103+
113.96.232.118 重庆
104+
223.104.3.11 北京
105+
211.94.239.180 天津
106+
139.214.246.116 吉林
107+
211.94.237.208 天津
108+
101.89.239.230 上海
109+
202.99.112.190 天津
110+
124.64.17.172 北京
111+
124.64.19.5 北京
112+
61.148.243.117 北京
113+
221.192.179.120 河北
114+
122.97.175.103 江苏
115+
202.99.113.50 天津
116+
218.67.234.74 天津
117+
58.218.133.250 江苏
118+
122.97.175.148 江苏
119+
112.224.67.28 山东
120+
61.151.207.252 江苏
121+
61.181.218.93 天津
122+
117.10.206.177 天津
123+
223.104.236.240 辽宁
124+
123.151.77.123 河北
125+
220.194.107.221 北京
126+
110.251.15.159 河北
127+
175.24.45.114 上海
128+
124.64.19.86 北京
129+
220.194.107.222 北京
130+
221.192.178.44 河北
131+
114.242.248.155 北京
132+
223.104.175.237 辽宁
133+
223.104.175.86 辽宁
134+
211.94.240.149 天津
135+
223.104.176.106 辽宁
136+
221.192.179.167 河北
137+
124.64.17.217 北京
138+
222.186.136.164 江苏
139+
124.64.18.245 北京
140+
211.94.208.8 天津
141+
223.104.176.23 辽宁
142+
122.97.175.145 江苏
143+
211.94.253.31 天津
144+
36.104.39.237 吉林
145+
221.192.179.96 河北
146+
218.69.52.34 天津
147+
211.94.254.22 天津
148+
223.104.103.17 河北
149+
61.148.243.204 北京
150+
124.64.17.54 北京
151+
139.214.251.83 吉林
152+
139.214.244.217 吉林
153+
124.64.19.162 北京
154+
117.136.54.52 天津
155+
220.181.108.101 广东
156+
220.181.108.171 广东
157+
111.206.221.22 北京
158+
111.206.221.45 北京
159+
111.206.221.11 北京
160+
111.206.198.26 北京
161+
111.206.221.43 北京
162+
111.206.198.101 北京
163+
111.206.221.108 北京
164+
117.10.206.50 天津
165+
103.3.96.166 天津
166+
61.181.219.241 天津
167+
124.64.19.43 北京
168+
103.3.97.8 天津
169+
61.148.243.124 北京
170+
124.64.19.186 北京
171+
221.192.179.34 河北
172+
36.104.122.38 吉林
173+
221.192.180.153 河北
174+
211.94.245.65 天津
175+
221.192.178.240 河北
176+
220.181.108.177 广东
177+
220.181.108.161 广东
178+
111.206.198.50 北京
179+
111.206.198.119 北京
180+
111.206.198.70 北京
181+
111.206.198.6 北京
182+
111.206.221.102 北京
183+
111.206.221.44 北京
184+
111.206.198.41 北京
185+
36.98.226.192 河北
186+
124.64.19.93 北京

0 commit comments

Comments
 (0)