基于Python的实时爬虫:每小时抓取PM2.5等污染物数据
01 | # coding:utf-8 |
02 | import threading |
03 | import urllib |
04 | import re,sys |
05 | import time |
06 | import hashlib |
07 | import os |
08 | |
09 | |
# Make UTF-8 the interpreter-wide default for implicit str/unicode conversion.
# The previous form assigned the string "utf-8" to the attribute name, which
# left the default encoding untouched and merely stored a str on ``sys``.
# On Python 2 the real function is removed from ``sys`` by ``site`` during
# startup, so the module has to be reloaded before the call is possible.
# Python 3 has no such function and already defaults to UTF-8, hence the
# version guard.
if sys.version_info[0] == 2:
    reload(sys)  # noqa: F821 - ``reload`` is a builtin on Python 2 only
    sys.setdefaultencoding("utf-8")
11 | |
12 | |
13 | def fetchdata(city): |
14 | print city |
15 | md5 = "" |
16 | while True : |
17 | temp = "http://www.pm25.in/" + city #爬虫的站为:www.pm25.in,只要之前IP没有被该网站封了,就可以爬,假如被封了请申请API |
18 | url = urllib.urlopen(temp) |
19 | text = url.read() |
20 | |
21 | |
22 | shuju = re.findall( "<td>(.*?)</td>" ,text,re.S) #正则pm2.5等污染物数据 |
23 | data_time = re.findall( "\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}" ,text,re.S) #正则寻找当前时间 例如,2016-04-13 20:10:00 |
24 | |
25 | |
26 | md52 = hashlib.md5() |
27 | md52.update(data_time[ 0 ]) |
28 | |
29 | if md52.hexdigest() = = md5: |
30 | time.sleep( 3600 ) #自动休眠,每一小时爬一次数据 |
31 |
|