爬虫代码范例-抓取PTT看板NBA，并存储为json格式

2024年1月17日 20:47:37 | 技术•随笔 | 评论 1 | 字数 863 | 阅读 2分52秒 | 阅读模式

import requests
from bs4 import BeautifulSoup
import json
# Scrape the index page of the PTT NBA board and save, for every listed post,
# its title, popularity counter and date to a JSON file.
url='https://www.ptt.cc/bbs/nba/index.html'
# A browser-like User-Agent is sent along with the request.
headers={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'}
# Bound the wait so a stalled connection cannot hang the script forever.
response=requests.get(url,headers=headers,timeout=10)
# Fail loudly on an HTTP error instead of parsing an error page and
# reporting success.
response.raise_for_status()
soup=BeautifulSoup(response.text,'html.parser')
# Every post entry on the page is a <div class="r-ent">.
articles=soup.find_all("div",class_="r-ent")
data_list=[]
for a in articles:
    # The title is the text of the link inside <div class="title">; when the
    # div or its link is missing, a placeholder title is stored instead.
    title=a.find('div',class_='title')
    title=title.a.text if title and title.a else '没有标题'

    # The popularity counter is the <span> inside <div class="nrec">; it is
    # absent when the div or span is missing.
    popular=a.find('div',class_='nrec')
    popular=popular.span.text if popular and popular.span else 'N/A'

    # The date is the text of <div class="date">.
    date=a.find('div',class_='date')
    date=date.text if date else 'N/A'

    data_list.append({'标题':title,'人气':popular,'日期':date})
# ensure_ascii=False keeps the Chinese text readable in the output file.
with open('ppt_nab_crawler.json','w',encoding='utf-8') as file:
    json.dump(data_list,file,ensure_ascii=False,indent=4)
print('资料已经存储为ppt_nab_crawler.json')

继续阅读