Crawl-Html2MD

Crawling web pages and converting the HTML to Markdown

Dependencies

requests
bs4
html2text
os
json
nest_asyncio
pandas
re
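
Of these, os, json, and re ship with the Python standard library, so only the third-party packages need installing. A minimal install sketch (openpyxl is assumed here because pandas needs an engine to read .xlsx files; adjust to your environment):

pip install requests beautifulsoup4 html2text nest_asyncio pandas openpyxl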

The reference input Excel file is as follows

Reference file: download
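
The exact contents of the download are not reproduced here. Judging from the columns the script reads, the sheet is assumed to look roughly like the frame below; the column names come from the code, while the sample values and the to_excel call are purely illustrative.

import pandas

# Hypothetical example row; the column names match what the crawl script expects.
example = pandas.DataFrame({
    'site_id': [1],
    'title': ['Example article'],
    'department': ['News Office'],
    'publish_date': ['2024-01-01'],
    'url': ['https://example.com/news/1.htm'],
})
# index=True writes a leading index column, which the script later consumes via index_col=0.
example.to_excel('mnews_example.xlsx', sheet_name='mnews', index=True)

Writing .xlsx output this way relies on openpyxl being installed.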

Code example

import requests
import bs4
import html2text
import os
import json           # listed as a dependency but not used in this snippet
import nest_asyncio   # listed as a dependency but not used in this snippet
import pandas
import re

# Input Excel configuration
read_excel = 'mnews_example.xlsx'
read_sheet = 'mnews'

# Get the current working directory
current_directory = os.getcwd()

# Output directory configuration (created if it does not already exist)
write_directory = os.path.join(current_directory, 'data')
os.makedirs(write_directory, exist_ok=True)

# Regex for stripping special characters (used to sanitise file names)
pattern = r'[!@#$%^&*()\-+=\'\'\"\":;<>?,.//\[\]_\{\}\\\|]'

# Read the DataFrame; without header=None, the first row is used as column names
read_df = pandas.read_excel(read_excel, sheet_name=read_sheet, index_col=0)
read_df_max = read_df.shape[0]

# Counter for the while loop
print('★ Starting the loop')
i = 0
while i < read_df_max:
    # Assign the row's values to variables
    site_id = read_df.iloc[i, read_df.columns.get_loc('site_id')]
    title = read_df.iloc[i, read_df.columns.get_loc('title')]
    department = read_df.iloc[i, read_df.columns.get_loc('department')]
    publish_date = read_df.iloc[i, read_df.columns.get_loc('publish_date')]
    url = read_df.iloc[i, read_df.columns.get_loc('url')]
    print('☆ Row ' + str(i) + ': page to crawl')
    print('site_id: ' + str(site_id) + '\ntitle: ' + title +
          '\ndepartment: ' + department, 'publish_date: ' + str(publish_date),
          '\nurl: ' + url)

    # Fetch the page and convert it to Markdown
    r = requests.get(url)
    r.encoding = 'utf-8'
    r.raise_for_status()  # Check that the request succeeded
    htmldata = bs4.BeautifulSoup(r.text, 'html.parser').find_all("div", class_="wp_articlecontent")
    # Convert the HTML to Markdown with html2text
    h = html2text.HTML2Text()
    h.ignore_links = True
    # Join the matched <div> tags so list brackets and commas do not leak into the output
    markdown_content = h.handle(''.join(str(tag) for tag in htmldata))

    # Print the Markdown content (for debugging)
    # print(markdown_content)
    # Prepend metadata to the Markdown content
    markdown_content = 'Carry the following information when chunking this document:\n' + \
        'Title: ' + str(title) + '\nDepartment: ' + str(department) + \
        '\nPublish date: ' + str(publish_date) + '\nURL: ' + str(url) + '\n' + markdown_content

    # Strip special characters so the title can be used as a file name
    title = re.sub(pattern, '', str(title))
    with open(os.path.join(write_directory, title + '.md'), 'w', encoding='utf-8') as file:
        file.write(markdown_content)
    # Increment the counter
    i = i + 1

print('☆ Loop finished; crawl complete. See the files under ' + str(write_directory))
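
Because raise_for_status() raises on any HTTP error, a single bad URL aborts the whole run. If you would rather skip failing rows, one option is to wrap the fetch in try/except and add a timeout; the helper below is a sketch under those assumptions (the function name fetch_article_html is ours, not part of the original script).

import requests
import bs4

def fetch_article_html(url, timeout=10):
    """Return the article <div> HTML for one URL, or None if the request fails."""
    try:
        r = requests.get(url, timeout=timeout)
        r.encoding = 'utf-8'
        r.raise_for_status()
    except requests.RequestException as exc:
        print('☆ Skipping ' + url + ': ' + str(exc))
        return None
    divs = bs4.BeautifulSoup(r.text, 'html.parser').find_all('div', class_='wp_articlecontent')
    return ''.join(str(tag) for tag in divs)

Inside the loop, the Markdown conversion would then run only when the helper returns something other than None.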