
BBC English at Work Scraper Code

Warning
This post was last updated on 2025-02-20; its contents may be outdated.

Written with help from claude.ai. The script below crawls the BBC Learning English "English at Work" index page, follows each episode link, converts the transcript HTML to Markdown, and saves it as a Hugo post with frontmatter. It depends on three third-party packages: requests, beautifulsoup4, and html2markdown.

import requests
from bs4 import BeautifulSoup
import html2markdown
import os
from urllib.parse import urljoin

def get_article_links(url):
    """获取所有文章链接"""
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Find the <ul class="threecol"> element that lists the episodes
        article_ul = soup.find('ul', attrs={'class': 'threecol'})
        if not article_ul:
            return []
            
        links = []
        for li in article_ul.find_all('li'):
            # Guard against list items without the expected <div class="img"> wrapper
            img_div = li.find('div', class_='img')
            img_link = img_div.find('a') if img_div else None
            if img_link and img_link.get('href'):
                full_url = urljoin(url, img_link['href'])
                links.append(full_url)
        
        return links
    except Exception as e:
        print(f"获取链接时出错: {e}")
        return []

def get_article_content(url):
    """获取文章内容并转换为markdown"""
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Locate the transcript text inside the rich-text widget
        widget = soup.find('div', class_='widget widget-richtext 6')
        if not widget:
            return None
        text_div = widget.find('div', class_='text')
        if not text_div:
            return None

        # Serialize the children, moving stray <br/> tags out of <strong>
        # so line breaks survive the markdown conversion
        html_content = ''.join(str(x) for x in text_div.children)
        html_content = html_content.replace("<br/></strong>", '</strong><br/>')

        # Convert the HTML fragment to markdown
        markdown_content = html2markdown.convert(html_content)

        return markdown_content
    except Exception as e:
        print(f"获取文章内容时出错: {e}")
        return None

def save_markdown(content, url):
    """保存markdown文件"""
    try:
        # Use the last URL segment as the file-name slug and post title
        title = url.rstrip('/').split('/')[-1]
        filename = 'bbc-english-at-work-transcript-' + title + '.md'
        content = f"original url: [{title}]({url})\n\n" + content
        # Create the output directory if it does not already exist
        os.makedirs('output', exist_ok=True)
            
        # Write the markdown file
        with open(os.path.join('output', filename), 'w', encoding='utf-8') as f:
            f.write(content)
        add_hugo_post_frontmatter(os.path.join('output', filename), "BBC English at Work 完整台词 "+title,'2025-02-20T15:52:34+08:00', [""])

        return True
    except Exception as e:
        print(f"保存文件时出错: {e}")
        return False

def add_hugo_post_frontmatter(filename, title, date, tags):
    """在markdown文件中添加Hugo的frontmatter"""
    try:
        # Read the existing file content
        with open(filename, 'r', encoding='utf-8') as f:
            content = f.read()

        # Build the frontmatter block
        frontmatter = f"""---
title: "{title}"
date: "{date}"
description: "BBC English at Work 完整台词。学习职场英语对话、地道表达和商务沟通技巧。"

categories: 
  - "职场英语"
  - "英语学习"
tags:
  - "BBC English at Work"
  - "商务英语"
  - "职场对话"
  - "英语听力"
  - "英语口语"
  
keywords:
  - "BBC English at Work"
  - "职场英语对话"
  - "商务英语学习"
  - "英语听力练习"
---
"""

        # Prepend the frontmatter to the file content
        content = frontmatter + content

        # Write the file back
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(content)

        return True
    except Exception as e:
        print(f"添加frontmatter时出错: {e}")
        return False

def main():
    base_url = 'https://www.bbc.com/learningenglish/english/features/english-at-work/'
    
    # Fetch all episode links
    print("Fetching article links...")
    article_links = get_article_links(base_url)
    
    if not article_links:
        print("没有找到任何文章链接")
        return
        
    print(f"找到 {len(article_links)} 个文章链接")
    
    # Process each article
    for i, link in enumerate(article_links, 1):
        print(f"\n处理第 {i} 个文章: {link}")
        
        # Fetch the article content
        content = get_article_content(link)
        if not content:
            print("无法获取文章内容,跳过")
            continue
            
        # Save the article
        if save_markdown(content, link):
            print("Article saved successfully")
        else:
            print("Failed to save article")

if __name__ == "__main__":
    main()