爬虫练习

from bs4 import BeautifulSoup
import requests
import os
import shutil

# Browser-like request headers sent with every page fetch.
# Long values are split with implicit string concatenation for readability;
# the resulting strings are unchanged.
headers = {
    "Accept": (
        "text/html,application/xhtml+xml,application/xml;q=0.9,"
        "image/webp,*/*;q=0.8"
    ),
    "Accept-Language": "zh-CN,zh;q=0.8",
    "Connection": "close",
    "Cookie": (
        "_gauges_unique_hour=1; "
        "_gauges_unique_day=1; "
        "_gauges_unique_month=1; "
        "_gauges_unique_year=1; "
        "_gauges_unique=1"
    ),
    "Referer": "http://www.infoq.com",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 6.1; WOW64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/57.0.2987.98 Safari/537.36 LBBROWSER"
    ),
}

# url = 'https://www.infoq.com/presentations/'

def download_jpg(imageurl,image_location_path):
    """Stream the resource at ``imageurl`` into a local file.

    Nothing is written when the server answers with a status other than 200.

    Args:
        imageurl: URL of the image to fetch.
        image_location_path: Destination path; the file is opened in binary
            write mode, so an existing file is overwritten.
    """
    # stream=True defers the body download so it can be copied to disk
    # through response.raw instead of being loaded into memory first.
    response = requests.get(imageurl, stream=True)
    try:
        if response.status_code != 200:
            return
        # Ask the raw stream to undo gzip/deflate transfer encoding.
        # The original assigned to a misspelled attribute ("deconde_content"),
        # so the flag was never set and compressed bodies were saved as-is.
        response.raw.decode_content = True
        with open(image_location_path, 'wb') as f:
            shutil.copyfileobj(response.raw, f)
    finally:
        # A streamed response holds its connection until explicitly closed.
        response.close()


def craw3(url):
    """Download every image found inside the ``card__content`` divs of a page.

    Fetches ``url`` with the module-level ``headers``, parses the HTML with
    BeautifulSoup (lxml parser) and saves each ``<img>`` into
    ``./download_pic`` (created when missing). Each file is named after the
    last path segment of its image URL.

    Args:
        url: Address of the page to scrape; also used as the base for
            resolving relative image URLs.
    """
    # Function-scope import keeps this block independent of the file's
    # top-level import list.
    from urllib.parse import urljoin, urlparse

    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'lxml')

    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs('./download_pic', exist_ok=True)
    # Loop-invariant: resolve the target directory once.
    target_dir = os.path.abspath('./download_pic')

    for card in soup.find_all('div', class_='card__content'):
        images = card.find_all('img')
        print(images)
        for pic in images:
            src = pic.get('src')
            if not src:
                # No src attribute: os.path.basename(None) would raise.
                continue
            # Resolve relative or protocol-relative sources against the page.
            imgurl = urljoin(url, src)
            # Use only the URL path so a query string does not end up in
            # the file name.
            filename = os.path.basename(urlparse(imgurl).path)
            if not filename:
                # URL path ends with '/', so there is no usable file name.
                continue
            imgpath = os.path.join(target_dir, filename)
            print('开始下载 %s' % imgurl)
            download_jpg(imgurl, imgpath)

# Crawl three listing pages; the number appended to the URL takes the
# values 12, 24 and 36 (presumably a paging offset -- confirm against the site).
for i in range(12, 37, 12):
    # The original concatenated the number directly onto "presentations"
    # (".../presentations12"); a '/' is needed to make it a path segment.
    url = 'http://www.infoq.com/cn/presentations/' + str(i)
    craw3(url)
此条目发表在Python分类目录。将固定链接加入收藏夹。

发表评论

邮箱地址不会被公开。 必填项已用*标注

To create code blocks or other preformatted text, indent by four spaces:

    This will be displayed in a monospaced font. The first four 
    spaces will be stripped off, but all other whitespace
    will be preserved.
    
    Markdown is turned off in code blocks:
     [This is not a link](http://example.com)

To create an inline code span rather than a block, use backticks:

Here is some inline `code`.

For more help see http://daringfireball.net/projects/markdown/syntax

Protected with IP Blacklist Cloud