爬取网页HTML转Markdown
依赖库
- requests
- bs4
- html2text
- os
- json
- nest_asyncio
- pandas
- re
参考输入的 Excel 文件如下
参考文件:下载
代码示例
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68
import requests
import bs4
import html2text
import os
import json
import nest_asyncio
import pandas
import re

# Script: read a list of pages from an Excel sheet, download each page,
# convert the article <div> to Markdown and save one .md file per row.

# Excel workbook and sheet that list the pages to fetch.
read_excel = 'mnews_example.xlsx'
read_sheet = 'mnews'

# Current working directory.
current_directory = os.getcwd()

# Output directory (with trailing separator). Built with os.path so it works
# on any OS, and created up front so the first open(..., 'w') cannot fail
# with FileNotFoundError.
write_directory = os.path.join(current_directory, 'data') + os.sep
os.makedirs(write_directory, exist_ok=True)

# Characters stripped from the title before it is used as a file name.
pattern = r'[!@#$%^&*()\-+=\'\'\"\":;<>?,.//\[\]_\{\}\\\|]'

# Without header=None the first sheet row supplies the column names;
# index_col=0 makes the first column the index.
read_df = pandas.read_excel(read_excel, sheet_name=read_sheet, index_col=0)
read_df_max = read_df.shape[0]

# Column positions are loop-invariant, so resolve them once.
column_positions = [
    read_df.columns.get_loc(name)
    for name in ('site_id', 'title', 'department', 'publish_date', 'url')
]

print('★ 开始循环操作')
for i in range(read_df_max):
    # Pull this row's fields.
    site_id, title, department, publish_date, url = (
        read_df.iloc[i, position] for position in column_positions
    )
    print('☆ 第' + str(i) + ' 行 抓取的页面信息')
    # f-string formatting accepts non-string cells (numbers, NaN) that the
    # plain '+' concatenation would reject with TypeError.
    print(f'site_id: {site_id}\ntitle: {title}\ndepartment: {department}',
          f'publish_date: {publish_date}',
          f'\nurl: {url}')

    # Download the page; the timeout keeps one dead host from hanging the run.
    r = requests.get(url, timeout=30)
    r.encoding = 'utf-8'
    r.raise_for_status()  # abort on HTTP error status
    htmldata = bs4.BeautifulSoup(r.text, 'html.parser').find_all(
        "div", class_="wp_articlecontent")

    # Convert HTML to Markdown. find_all returns a list-like ResultSet, so
    # join the elements instead of str()-ing the list, which would leak
    # '[', ', ' and ']' into the output.
    h = html2text.HTML2Text()
    h.ignore_links = True
    markdown_content = h.handle(''.join(str(node) for node in htmldata))

    # Prepend the row metadata to the Markdown body.
    markdown_content = (
        '文档分片时带入以下信息:\n文章标题: ' + str(title)
        + '\n所属部门: ' + str(department)
        + '\n发布时间: ' + str(publish_date)
        + '\nurl地址: ' + str(url)
        + '\n' + markdown_content
    )

    # Strip special characters so the title is usable as a file name.
    title = re.sub(pattern, '', str(title))
    with open(os.path.join(write_directory, title + '.md'), 'w',
              encoding='utf-8') as file:
        file.write(markdown_content)

print('☆ 循环结束,抓取完成,请查看' + str(write_directory) + '目录下的文件')
|