BBC English at Work Scraper
Warning: This article was last updated on 2025-02-20; its contents may be out of date.
Written with help from claude.ai.
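The script below crawls the index page of the BBC Learning English series English at Work, extracts each episode's transcript, converts it to Markdown, and prepends Hugo frontmatter before saving the result under `output/`. It needs three third-party packages: `requests`, `beautifulsoup4`, and `html2markdown` (`pip install requests beautifulsoup4 html2markdown`).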
```python
import os
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
import html2markdown


def get_article_links(url):
    """Collect links to all episode pages from the series index page."""
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # The episode list is a <ul class="threecol"> element
        article_ul = soup.find('ul', attrs={'class': 'threecol'})
        if not article_ul:
            return []

        links = []
        for li in article_ul.find_all('li'):
            # Guard against list items without an image block
            img_div = li.find('div', class_='img')
            if not img_div:
                continue
            img_link = img_div.find('a')
            if img_link:
                full_url = urljoin(url, img_link['href'])
                links.append(full_url)
        return links
    except Exception as e:
        print(f"Error while fetching links: {e}")
        return []

def get_article_content(url):
    """Fetch an episode page and convert the transcript to Markdown."""
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # The transcript lives in <div class="widget widget-richtext 6">,
        # inside a nested <div class="text">
        widget = soup.find('div', class_='widget widget-richtext 6')
        if not widget:
            return None
        text_div = widget.find('div', class_='text')
        if not text_div:
            return None

        html_content = ''.join(str(x) for x in text_div.children)
        if not html_content:
            return None

        # Move trailing <br/> tags out of <strong> so the Markdown renders cleanly
        html_content = html_content.replace('<br/></strong>', '</strong><br/>')

        # Convert the HTML to Markdown
        return html2markdown.convert(html_content)
    except Exception as e:
        print(f"Error while fetching article content: {e}")
        return None

def save_markdown(content, url):
    """Save the Markdown content under output/ and add Hugo frontmatter."""
    try:
        # Use the last path segment of the URL as the slug
        slug = url.rstrip('/').split('/')[-1]
        filename = 'bbc-english-at-work-transcript-' + slug + '.md'
        content = f"original url: [{slug}]({url})\n\n" + content

        # Create the output directory if it does not exist
        os.makedirs('output', exist_ok=True)

        # Write the file, then prepend the Hugo frontmatter
        filepath = os.path.join('output', filename)
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(content)
        add_hugo_post_frontmatter(
            filepath,
            'BBC English at Work complete transcript ' + slug,
            '2025-02-20T15:52:34+08:00',
            [''],
        )
        return True
    except Exception as e:
        print(f"Error while saving file: {e}")
        return False

def add_hugo_post_frontmatter(filename, title, date, tags):
    """Prepend Hugo frontmatter to a Markdown file.

    Note: the tags argument is accepted but currently unused; the tag
    list below is hardcoded.
    """
    try:
        # Read the existing file content
        with open(filename, 'r', encoding='utf-8') as f:
            content = f.read()

        # Build the frontmatter
        frontmatter = f"""---
title: "{title}"
date: "{date}"
description: "Complete BBC English at Work transcripts: workplace English dialogue, authentic expressions, and business communication skills."
categories:
- "Workplace English"
- "English Learning"
tags:
- "BBC English at Work"
- "Business English"
- "Workplace Dialogue"
- "English Listening"
- "Spoken English"
keywords:
- "BBC English at Work"
- "workplace English dialogue"
- "business English learning"
- "English listening practice"
---
"""
        # Prepend the frontmatter and write the file back
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(frontmatter + content)
        return True
    except Exception as e:
        print(f"Error while adding frontmatter: {e}")
        return False

def main():
    base_url = 'https://www.bbc.com/learningenglish/english/features/english-at-work/'

    # Collect all episode links from the index page
    print("Fetching article links...")
    article_links = get_article_links(base_url)
    if not article_links:
        print("No article links found")
        return
    print(f"Found {len(article_links)} article links")

    # Fetch, convert, and save each episode
    for i, link in enumerate(article_links, 1):
        print(f"\nProcessing article {i}: {link}")
        content = get_article_content(link)
        if not content:
            print("Could not fetch article content, skipping")
            continue
        if save_markdown(content, link):
            print("Article saved")
        else:
            print("Failed to save article")


if __name__ == "__main__":
    main()
```
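One fragile spot worth flagging: in BeautifulSoup, `find('div', class_='widget widget-richtext 6')` with a multi-word string only matches when the element's class attribute is exactly that string, so any change to the BBC page's class list makes the lookup fail silently. A CSS selector that matches the individual classes is more tolerant; a minimal sketch, assuming the transcript still sits in a `widget-richtext` div with a nested `div.text`:

```python
# More tolerant lookup: select on individual classes instead of the exact
# class-attribute string. Assumes the transcript is still inside a div
# carrying the widget-richtext class, with a nested <div class="text">.
text_div = soup.select_one('div.widget.widget-richtext div.text')
if text_div is not None:
    html_content = ''.join(str(x) for x in text_div.children)
```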