Crawl-Html2MD

Crawling web pages and converting the HTML to Markdown

Dependencies

requests
bs4
html2text
os
json
nest_asyncio
pandas
re
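
Of these, os, json, and re ship with the Python standard library, so only the third-party packages need installing. A minimal install sketch (openpyxl is assumed here because pandas needs an engine to read .xlsx files; adjust to your environment):

pip install requests beautifulsoup4 html2text nest_asyncio pandas openpyxl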

The reference input Excel file is as follows

Reference file: download
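
The exact contents of the download are not reproduced here. Judging from the columns the script reads, the sheet is assumed to look roughly like the frame below; the column names come from the code, while the sample values and the to_excel call are purely illustrative.

import pandas

# Hypothetical example row; the column names match what the crawl script expects.
example = pandas.DataFrame({
    'site_id': [1],
    'title': ['Example article'],
    'department': ['News Office'],
    'publish_date': ['2024-01-01'],
    'url': ['https://example.com/news/1.htm'],
})
# index=True writes a leading index column, which the script later consumes via index_col=0.
example.to_excel('mnews_example.xlsx', sheet_name='mnews', index=True)

Writing .xlsx output this way relies on openpyxl being installed.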

Code example

import requests
import bs4
import html2text
import os
import json           # listed as a dependency but not used in this snippet
import nest_asyncio   # listed as a dependency but not used in this snippet
import pandas
import re

# Input Excel configuration
read_excel = 'mnews_example.xlsx'
read_sheet = 'mnews'

# Get the current working directory
current_directory = os.getcwd()

# Output directory configuration (created if it does not already exist)
write_directory = os.path.join(current_directory, 'data')
os.makedirs(write_directory, exist_ok=True)

# Regex for stripping special characters (used to sanitise file names)
pattern = r'[!@#$%^&*()\-+=\'\'\"\":;<>?,.//\[\]_\{\}\\\|]'

# Read the DataFrame; without header=None, the first row is used as column names
read_df = pandas.read_excel(read_excel, sheet_name=read_sheet, index_col=0)
read_df_max = read_df.shape[0]

# Counter for the while loop
print('★ Starting the loop')
i = 0
while i < read_df_max:
    # Assign the row's values to variables
    site_id = read_df.iloc[i, read_df.columns.get_loc('site_id')]
    title = read_df.iloc[i, read_df.columns.get_loc('title')]
    department = read_df.iloc[i, read_df.columns.get_loc('department')]
    publish_date = read_df.iloc[i, read_df.columns.get_loc('publish_date')]
    url = read_df.iloc[i, read_df.columns.get_loc('url')]
    print('☆ Row ' + str(i) + ': page to crawl')
    print('site_id: ' + str(site_id) + '\ntitle: ' + title +
          '\ndepartment: ' + department, 'publish_date: ' + str(publish_date),
          '\nurl: ' + url)

    # Fetch the page and convert it to Markdown
    r = requests.get(url)
    r.encoding = 'utf-8'
    r.raise_for_status()  # Check that the request succeeded
    htmldata = bs4.BeautifulSoup(r.text, 'html.parser').find_all("div", class_="wp_articlecontent")
    # Convert the HTML to Markdown with html2text
    h = html2text.HTML2Text()
    h.ignore_links = True
    # Join the matched <div> tags so list brackets and commas do not leak into the output
    markdown_content = h.handle(''.join(str(tag) for tag in htmldata))

    # Print the Markdown content (for debugging)
    # print(markdown_content)
    # Prepend metadata to the Markdown content
    markdown_content = 'Carry the following information when chunking this document:\n' + \
        'Title: ' + str(title) + '\nDepartment: ' + str(department) + \
        '\nPublish date: ' + str(publish_date) + '\nURL: ' + str(url) + '\n' + markdown_content

    # Strip special characters so the title can be used as a file name
    title = re.sub(pattern, '', str(title))
    with open(os.path.join(write_directory, title + '.md'), 'w', encoding='utf-8') as file:
        file.write(markdown_content)
    # Increment the counter
    i = i + 1

print('☆ Loop finished; crawl complete. See the files under ' + str(write_directory))
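
Because raise_for_status() raises on any HTTP error, a single bad URL aborts the whole run. If you would rather skip failing rows, one option is to wrap the fetch in try/except and add a timeout; the helper below is a sketch under those assumptions (the function name fetch_article_html is ours, not part of the original script).

import requests
import bs4

def fetch_article_html(url, timeout=10):
    """Return the article <div> HTML for one URL, or None if the request fails."""
    try:
        r = requests.get(url, timeout=timeout)
        r.encoding = 'utf-8'
        r.raise_for_status()
    except requests.RequestException as exc:
        print('☆ Skipping ' + url + ': ' + str(exc))
        return None
    divs = bs4.BeautifulSoup(r.text, 'html.parser').find_all('div', class_='wp_articlecontent')
    return ''.join(str(tag) for tag in divs)

Inside the loop, the Markdown conversion would then run only when the helper returns something other than None.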