#python 3.6
import re
import urllib
import urllib.request

import mysql.connector
# Settings for the MySQL server on localhost that receives the scraped rows.
# NOTE(review): the credentials are hard-coded here; consider loading them
# from the environment or a config file before sharing this script.
_DB_SETTINGS = {
    'user': 'root',
    'password': 'root',
    'host': 'localhost',
    'database': 'test',
}

# One connection and one cursor are shared by every function in this module.
conn = mysql.connector.connect(**_DB_SETTINGS)
cursor = conn.cursor()
def getHtml(url):
    """Fetch *url* and return the response body decoded as UTF-8 text.

    The response object is used as a context manager so the underlying
    connection is closed as soon as the body has been read, even when
    reading or decoding fails.

    Args:
        url: Address to open with ``urllib.request.urlopen``.

    Returns:
        The full response body as a ``str``.

    Raises:
        urllib.error.URLError: If the URL cannot be opened.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    with urllib.request.urlopen(url) as response:
        return response.read().decode('utf-8')
# Labels that name a province-level region; the label that follows one of
# these is stored as the city.
_PROVINCES = frozenset([
    '河北', '山西', '辽宁', '吉林', '黑龙江', '江苏', '浙江', '安徽', '福建',
    '江西', '山东', '河南', '湖北', '湖南', '广东', '海南', '四川', '贵州', '云南',
    '陕西', '甘肃', '青海', '台湾', '内蒙古', '广西', '西藏', '宁夏', '新疆', '香港', '澳门',
])

# Labels stored as the province with an empty city.
_MUNICIPALITIES = frozenset(['北京', '天津', '上海', '重庆'])

# NOTE(review): the project and stats patterns contain no HTML markup and the
# label pattern ends in a bare greedy group, which suggests the tags were
# stripped from these literals at some point. The characters are kept exactly
# as found (the raw line breaks are spelled "\n"); confirm them against the
# real page source before relying on the results.
_PROJECT_RE = re.compile('(.*?)筹款进度\n', re.S)
_NAME_RE = re.compile('class="siteCardICH3" title="(.*.)" target="_blank"')
_STATS_RE = re.compile('(.*.)\n')
_LABEL_RE = re.compile('site_ALink siteIlB_item" target="_blank">(.*)')

# Values are bound by the driver, so quotes in scraped text cannot break or
# alter the statement.
_INSERT_SQL = """insert into test(项目名称,已筹款,支持数,筹款进度,省份or直辖市,市,类别)
values(%s,%s,%s,%s,%s,%s,%s)"""


def _split_location(labels):
    """Return ``(province, city)`` taken from a project's label list."""
    for pos, text in enumerate(labels):
        if text in _PROVINCES:
            # The city is the label right after the province, when there is one.
            city = labels[pos + 1] if pos + 1 < len(labels) else ''
            return text, city
        if text in _MUNICIPALITIES:
            return text, ''
    # No recognised region among the labels.
    return '', ''


def getData(html, leibie):
    """Extract the projects found in *html* and insert them into table ``test``.

    For every project fragment matched in the page, the project name, three
    statistics (the first with its first and last character removed) and the
    province/city labels are pulled out with regular expressions and written
    through the module-level ``cursor``. The transaction is committed once
    after all fragments of the page have been processed.

    Fragments without a name or with fewer than three statistics are skipped
    instead of raising ``IndexError``. A fragment without a recognisable
    region label is stored with empty province and city.

    The name is stored as scraped: single quotes are no longer replaced,
    because the values are passed as bound parameters rather than formatted
    into the SQL text.

    Args:
        html: Text of one listing page.
        leibie: Category name stored in the ``类别`` column of every row.

    Returns:
        None.
    """
    for card in _PROJECT_RE.findall(html):
        names = _NAME_RE.findall(card)
        stats = _STATS_RE.findall(card)
        if not names or len(stats) < 3:
            # Incomplete fragment: there is nothing meaningful to insert.
            continue

        province, city = _split_location(_LABEL_RE.findall(card))
        row = (
            names[0],
            stats[0][1:-1],  # drop the first and last character of the first statistic
            stats[1],
            stats[2],
            province,
            city,
            leibie,
        )
        cursor.execute(_INSERT_SQL, row)

    conn.commit()
def endPage(html):
    """Return the page number shown by the last ``normalPage`` link in *html*.

    Only the digits that follow ``normalPage">`` are captured, so markup or
    further links later on the same line do not break the integer conversion.
    When the page contains no such link the function returns 1 instead of
    raising ``IndexError``.

    Args:
        html: Text of a listing page.

    Returns:
        The number from the last ``normalPage`` link as an ``int``, or 1 if
        there is none.
    """
    page_numbers = re.findall(r'normalPage">\s*(\d+)', html)
    if not page_numbers:
        return 1
    return int(page_numbers[-1])
def main():
    """Crawl every category page by page and store the projects found.

    For each of the five categories the listing pages are fetched starting
    at page 1. Each page is passed to ``getData`` and the loop stops once the
    page counter exceeds the number reported by ``endPage`` for the page just
    fetched. The module-level cursor and connection are closed when the crawl
    ends, including when a fetch, parse or insert raises.
    """
    # (category id used in the URL, category name stored with each row)
    categories = (
        (1, '生物科技'),
        (2, '果蔬种植'),
        (3, '生态养殖'),
        (4, '茶酒饮品'),
        (5, '休闲零食'),
    )

    print('begin')
    try:
        for tid, leibie in categories:
            prefix = 'http://www.zhongchou.com/browse/id-28-tid-4' + str(tid) + '-sm-p'
            page = 1
            while True:
                html = getHtml(prefix + str(page))
                getData(html, leibie)
                # The last page number is re-read from every fetched page.
                endpage = endPage(html)
                print('page %s has finished' % page)
                page += 1
                if page > endpage:
                    break
            print('type "%s" has finished' % leibie)
    finally:
        # Release the database resources even if the crawl failed part-way.
        cursor.close()
        conn.close()
    print('all finished')
# Start the crawl. There is no __main__ guard, so this runs whenever the
# module is executed or imported.
main()
大创项目中做的一个关于融资达成率的小东西，为此自学了一下爬虫；这个版本使用 MySQL 数据库存取数据。