大叔资源备忘录

爬虫代码范例-抓取网页内容,并下载图片

import requests
from bs4 import BeautifulSoup
import os
def download_img(url, save_path):
    """Download the resource at ``url`` and write its bytes to ``save_path``.

    Nothing is written when the server answers with an HTTP error status,
    so an error page is never stored under an image file name.

    Args:
        url: Address of the image to fetch.
        save_path: Destination file path; created or overwritten.

    Raises:
        requests.RequestException: If the connection fails or times out.
        OSError: If ``save_path`` cannot be opened for writing.
    """
    print(f'正在下载图片……{url}')
    # Bound the wait so a single stalled server cannot hang the whole run.
    response = requests.get(url, timeout=30)
    if not response.ok:
        # A 4xx/5xx body is an error page, not image data; skip it instead
        # of saving a file that no image viewer can open.
        print(f'下载失败,状态码:{response.status_code}')
        print('-' * 30)
        return
    with open(save_path, 'wb') as f:
        f.write(response.content)
    print('-' * 30)
def main():
    """Scrape one hard-coded PTT article and download the images it links to.

    The page is requested with the ``over18=1`` cookie. The text of the third
    ``article-meta-value`` span (presumably the article title) names the
    output directory ``images/<title>``. Every ``<a>`` whose ``href`` ends in
    jpg/jpeg/png/gif is passed to ``download_img``.

    Raises:
        requests.RequestException: If the article page cannot be fetched or
            the server answers with an HTTP error status.
    """
    url = 'https://www.ptt.cc/bbs/Beauty/M.1686997472.A.FDA.html'
    headers = {"Cookie": "over18=1"}
    response = requests.get(url, headers=headers, timeout=30)
    # Stop early on an error page rather than parsing it for metadata.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    spans = soup.find_all('span', class_='article-meta-value')
    # Fewer than three metadata spans would otherwise raise IndexError.
    title = spans[2].text if len(spans) > 2 else 'untitled'
    # Replace characters that are illegal in directory names on common
    # file systems; a '/' would also silently create nested directories.
    safe_title = ''.join(
        '_' if ch in '\\/:*?"<>|' else ch for ch in title
    ).strip() or 'untitled'
    dir_name = f'images/{safe_title}'
    os.makedirs(dir_name, exist_ok=True)

    # Collect every anchor on the page; image links are picked by extension.
    links = soup.find_all('a')
    allow_file_name = ("jpg", "jpeg", "png", "gif")
    for link in links:
        href = link.get('href')
        if not href:
            continue
        file_name = href.split('/')[-1]
        extension = href.split('.')[-1].lower()
        if extension in allow_file_name:
            print(f"图片类型:{extension} ")
            print(f'url:{href}')
            # The path must not carry trailing spaces: they become part of
            # the file name and break the extension (".jpg  ").
            download_img(href, f'{dir_name}/{file_name}')

# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

这个程序下载下来的图片不能打开,其实是因为还没有找到真正的图片地址,后期有时间再修改代码。

退出移动版