python-下载固定百度贴吧地址中的图片
目标地址:https://tieba.baidu.com/p/3003913493?fr=ala0&pstaala=1&tpl=5&isgod=0
目标内容:view-source:https://tieba.baidu.com/p/3003913493?fr=ala0&pstaala=1&tpl=5&isgod=0
在kali linux运行脚本,图片会自动保存到运行脚本时的当前工作目录下(文件名依次为0.jpg、1.jpg……)
参考:http://blog.csdn.net/zywvvd/article/details/71123776
参考:http://www.cnblogs.com/fnng/p/3576154.html
root@kali:~/python/csvtpy/downloadtest# cat downloadjpg.py
#coding=utf-8
import urllib
import re
import sys
def getHtml(url):
    """Fetch *url* and return the raw response body.

    Args:
        url: Address of the page to download.

    Returns:
        The body exactly as read from the response (a byte string; no
        decoding is performed here).
    """
    # Function-scope import so the script runs unchanged on Python 2 and 3.
    try:
        from urllib.request import urlopen  # Python 3
    except ImportError:
        from urllib import urlopen  # Python 2

    page = urlopen(url)
    try:
        return page.read()
    finally:
        # Release the connection even if read() fails.
        page.close()
def getImg(html):
    """Download every ``.jpg`` image referenced in *html* and return their URLs.

    An image is recognised by a ``src="....jpg"`` attribute that is directly
    followed by a ``pic_ext`` attribute.  Each matched URL is saved to the
    current working directory as ``0.jpg``, ``1.jpg``, ... in order of
    appearance.

    Args:
        html: Page markup, either text or the raw bytes returned by
            ``getHtml``.

    Returns:
        The list of matched image URLs, in document order (empty when
        nothing matches).
    """
    # Function-scope import so the script runs unchanged on Python 2 and 3.
    try:
        from urllib.request import urlretrieve  # Python 3
    except ImportError:
        from urllib import urlretrieve  # Python 2

    # On Python 3 the downloaded page is ``bytes`` and cannot be searched with
    # a text pattern.  On Python 2 ``bytes`` is ``str``, so nothing changes.
    if isinstance(html, bytes) and not isinstance(html, str):
        html = html.decode("utf-8", "replace")

    # ``[^"]`` instead of ``.`` keeps the capture inside one attribute value:
    # with ``.+?`` an earlier non-matching ``src="..."`` would let the match
    # run across tags up to the next ``.jpg" pic_ext``.
    imgre = re.compile(r'src="([^"]+?\.jpg)" pic_ext')
    imglist = imgre.findall(html)

    for x, imgurl in enumerate(imglist):
        urlretrieve(imgurl, '%s.jpg' % x)
    return imglist
# Fetch the fixed Tieba thread once, download its images, then echo the
# list of image URLs that were saved.
html = getHtml(
    "https://tieba.baidu.com/p/3003913493?fr=ala0&pstaala=1&tpl=5&isgod=0"
)
print(getImg(html))
root@kali:~/python# python download-re.py
['https://imgsa.baidu.com/forum/w%3D580/sign=f21e206f7f1ed21b79c92eed9d6fddae/be2f070828381f309cae3c34ab014c086f06f0fc.jpg', 'https://imgsa.baidu.com/forum/w%3D580/sign=d248f9b3d01b0ef46ce89856edc551a1/270828381f30e924ce132dac4e086e061c95f7fc.jpg', 'https://imgsa.baidu.com/forum/w%3D580/sign=0a9456b60cf41bd5da53e8fc61db81a0/08381f30e924b899de8bc8a56c061d950b7bf6fc.jpg', 'https://imgsa.baidu.com/forum/w%3D580/sign=aa918a5919d5ad6eaaf964e2b1ca39a3/3f30e924b899a9013482eaab1f950a7b0308f5fc.jpg', 'https://imgsa.baidu.com/forum/w%3D580/sign=777e9f78af6eddc426e7b4f309dab6a2/c924b899a9014c08178c9938087b02087af4f4fc.jpg', 'https://imgsa.baidu.com/forum/w%3D580/sign=615f29c3dfc451daf6f60ce386fc52a5/9899a9014c086e06671f8ed600087bf40bd1cbfc.jpg', 'https://imgsa.baidu.com/forum/w%3D580/sign=d6e4596953da81cb4ee683c56267d0a4/89014c086e061d9571f186a579f40ad163d9cafc.jpg', 'https://imgsa.baidu.com/forum/w%3D580/sign=a14ed57783cb39dbc1c0675ee01709a7/6c086e061d950a7b7e82ff5908d162d9f3d3c9fc.jpg', 'https://imgsa.baidu.com/forum/w%3D580/sign=2c5005663bdbb6fd255be52e3925aba6/4e061d950a7b0208067e8e7c60d9f2d3562cc8fc.jpg', 'https://imgsa.baidu.com/forum/w%3D580/sign=ff41bd76b4fd5266a72b3c1c9b199799/3d950a7b02087bf4745be674f0d3572c10dfcffc.jpg', 'https://imgsa.baidu.com/forum/w%3D580/sign=9ebe54adb8a1cd1105b672288913c8b0/931bb051f819861850185f1948ed2e738ad4e6d4.jpg', 'https://imgsa.baidu.com/forum/w%3D580/sign=6848c6bb6f224f4a5799731b39f69044/013fb80e7bec54e7ec62aa83bb389b504ec26aa5.jpg', 'https://imgsa.baidu.com/forum/w%3D580/sign=ec69bf3e4510b912bfc1f6f6f3fcfcb5/a618367adab44aed0f0b4661b11c8701a08bfbea.jpg']
root@kali:~/python# python download-re.py
['https://imgsa.baidu.com/forum/w%3D580/sign=f21e206f7f1ed21b79c92eed9d6fddae/be2f070828381f309cae3c34ab014c086f06f0fc.jpg', 'https://imgsa.baidu.com/forum/w%3D580/sign=d248f9b3d01b0ef46ce89856edc551a1/270828381f30e924ce132dac4e086e061c95f7fc.jpg', 'https://imgsa.baidu.com/forum/w%3D580/sign=0a9456b60cf41bd5da53e8fc61db81a0/08381f30e924b899de8bc8a56c061d950b7bf6fc.jpg', 'https://imgsa.baidu.com/forum/w%3D580/sign=aa918a5919d5ad6eaaf964e2b1ca39a3/3f30e924b899a9013482eaab1f950a7b0308f5fc.jpg', 'https://imgsa.baidu.com/forum/w%3D580/sign=777e9f78af6eddc426e7b4f309dab6a2/c924b899a9014c08178c9938087b02087af4f4fc.jpg', 'https://imgsa.baidu.com/forum/w%3D580/sign=615f29c3dfc451daf6f60ce386fc52a5/9899a9014c086e06671f8ed600087bf40bd1cbfc.jpg', 'https://imgsa.baidu.com/forum/w%3D580/sign=d6e4596953da81cb4ee683c56267d0a4/89014c086e061d9571f186a579f40ad163d9cafc.jpg', 'https://imgsa.baidu.com/forum/w%3D580/sign=a14ed57783cb39dbc1c0675ee01709a7/6c086e061d950a7b7e82ff5908d162d9f3d3c9fc.jpg', 'https://imgsa.baidu.com/forum/w%3D580/sign=2c5005663bdbb6fd255be52e3925aba6/4e061d950a7b0208067e8e7c60d9f2d3562cc8fc.jpg', 'https://imgsa.baidu.com/forum/w%3D580/sign=ff41bd76b4fd5266a72b3c1c9b199799/3d950a7b02087bf4745be674f0d3572c10dfcffc.jpg', 'https://imgsa.baidu.com/forum/w%3D580/sign=9ebe54adb8a1cd1105b672288913c8b0/931bb051f819861850185f1948ed2e738ad4e6d4.jpg', 'https://imgsa.baidu.com/forum/w%3D580/sign=6848c6bb6f224f4a5799731b39f69044/013fb80e7bec54e7ec62aa83bb389b504ec26aa5.jpg', 'https://imgsa.baidu.com/forum/w%3D580/sign=ec69bf3e4510b912bfc1f6f6f3fcfcb5/a618367adab44aed0f0b4661b11c8701a08bfbea.jpg']
root@kali:~/python#
参考:https://zhidao.baidu.com/question/1672167856778968787.html
src="(.+?\.jpg)" pic_ext
解释:
src=" #匹配src="
(.+?\.jpg)
# 括号表示分组,将括号的内容捕获到分组当中
# .+表示匹配至少一个任意字符,问号?表示懒惰匹配,也就是匹配尽可能少的字符。
# .+?\.jpg合起来表示用尽可能少的字符匹配到.jpg为止,从而尽量避免匹配范围超出src属性的范围。
# 注意:.也能匹配引号,如果前面还有一个不以.jpg" pic_ext结尾的src="...",匹配仍可能跨到下一个标签;更稳妥的写法是[^"]+?\.jpg
# 这个括号也就可以匹配网页中图片的url了
" pic_ext #匹配" pic_ext