# Script that scrapes administrative-division data from the National Bureau of Statistics of China (国家统计局).
import requests
#from lxml import etree
from pyquery import PyQuery as pq
import csv
# Directory that holds the 2019 division-code pages; relative links found in
# those pages are appended to this prefix.
baseUrl = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/'

# Index page of that directory (the starting point of the crawl).
url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/index.html'

# Destination CSV file, relative to the current working directory.
filePath = './location.csv'

# Running counter; starts at zero.
num = 0

# HTTP request headers carrying a desktop-browser User-Agent string.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36',
}
def getContent(url):
    """Download *url* and return its HTML wrapped in a PyQuery document.

    The response body is decoded as GBK (undecodable bytes are dropped) and
    the resulting text is handed to PyQuery, so the parser does not have to
    guess the page encoding.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        A PyQuery object built from the decoded page text.

    Raises:
        requests.RequestException: On connection failure or timeout.
    """
    # Send the module-level browser headers and bound the wait so a stalled
    # connection cannot hang the whole crawl.
    ret = requests.get(url, headers=headers, timeout=30)
    # The decoded text must be kept: bytes.decode() returns a new str and
    # leaves ret.content untouched.
    html = ret.content.decode('gbk', 'ignore')
    return pq(html)
def create_csv(path, csv_head):
    """Append the header row *csv_head* to the CSV file at *path*.

    The file is created if it does not exist. An existing file is not
    truncated; the header row is added at its end.

    Args:
        path: Path of the CSV file.
        csv_head: Sequence of column names to write as one row.
    """
    # newline='' lets the csv module control line endings itself; without it
    # text-mode newline translation yields "\r\r\n" (blank rows) on Windows.
    with open(path, 'a', encoding='UTF-8', newline='') as f:
        csv_write = csv.writer(f)
        csv_write.writerow(csv_head)
def write_csv(path, data_row):
    """Append one data row to the CSV file at *path*.

    The file is created if it does not exist; existing content is kept.

    Args:
        path: Path of the CSV file.
        data_row: Sequence of field values to write as one row.
    """
    # newline='' lets the csv module control line endings itself; without it
    # text-mode newline translation yields "\r\r\n" (blank rows) on Windows.
    with open(path, 'a', encoding='UTF-8', newline='') as f:
        csv_write = csv.writer(f)
        csv_write.writerow(data_row)
# Create the output file if needed and write the column-description row.
create_csv(filePath, ["level", "code", "name", "p_code"])

# Level 1: provinces, taken from the links on the index page.
doc = getContent(url)
tdas = doc('.provincetr > td > a').items()
for a in tdas:
    num += 1
    tmpUrl = a.attr.href
    if tmpUrl is None:
        # No link means no code to derive and no city page to follow.
        continue
    # The province code is the part of the href before the first dot.
    shengId = tmpUrl.split('.', 1)[0]
    write_csv(filePath, (1, shengId, a.text(), 0))

    # Level 2: cities of this province.
    shiList = getContent(baseUrl + tmpUrl)
    for shi in shiList('.citytr').items():
        tmpUrlShi = shi.find('a').attr.href
        # First link holds the full code, second link the name; the city
        # code is the first 4 digits.
        shiId = shi('a:eq(0)').text()[0:4]
        write_csv(filePath, (2, shiId, shi('a:eq(1)').text(), shengId))
        if tmpUrlShi is None:
            # Row is recorded, but there is no county page to descend into.
            continue

        # Level 3: counties / districts of this city.
        xianList = getContent(baseUrl + tmpUrlShi)
        for xian in xianList('.countytr').items():
            tmpUrlXian = xian.find('a').attr.href
            if tmpUrlXian is None:
                # Rows without a link are skipped entirely.
                continue
            # County codes are 6 digits (province 2 + city 2 + county 2);
            # a shorter slice makes sibling counties share one code.
            xianId = xian('a:eq(0)').text()[0:6]
            write_csv(filePath, (3, xianId, xian('a:eq(1)').text(), shiId))

            # Level 4: towns / sub-districts of this county.
            # NOTE(review): county hrefs are presumably relative to the
            # province directory, hence the shengId path segment - confirm.
            jieList = getContent(baseUrl + shengId + '/' + tmpUrlXian)
            for jie in jieList('.towntr').items():
                # Town codes are 9 digits (county code + 3 town digits).
                jieId = jie('a:eq(0)').text()[0:9]
                write_csv(filePath, (4, jieId, jie('a:eq(1)').text(), xianId))
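# P.S. Install the required libraries (requests, pyquery) yourself.
# This work is licensed under the Creative Commons Attribution-ShareAlike 4.0 International License.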