Skip to content

Commit 8acb567

Browse files
authored
Merge pull request #1469 from harehare/fix/html-to-markdown-content-extraction
🐛 fix(mq-crawler): improve HTML to Markdown conversion for pages with sidebars and JS
2 parents 4cb7a0e + 53aa48a commit 8acb567

File tree

11 files changed

+719
-220
lines changed

11 files changed

+719
-220
lines changed

Cargo.lock

Lines changed: 118 additions & 102 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -49,7 +49,9 @@ deadpool-libsql = "0.1.0"
4949
dirs = "6.0.0"
5050
divan = {version = "3.0.5", package = "codspeed-divan-compat"}
5151
ego-tree = "0.10.0"
52+
chromiumoxide = {version = "0.9", default-features = false}
5253
fantoccini = {version = "0.22.1", default-features = false, features = ["rustls-tls"]}
54+
rustls = {version = "0.23", default-features = false, features = ["ring"]}
5355
futures = "0.3"
5456
httpmock = "0.8.2"
5557
itertools = "0.14.0"
@@ -68,7 +70,7 @@ proptest = "1.10"
6870
pyo3 = "0.28.2"
6971
rayon = "1.11.0"
7072
regex-lite = "0.1.9"
71-
reqwest = {version = "0.12", default-features = false}
73+
reqwest = {version = "0.13", default-features = false}
7274
robots_txt = "0.7"
7375
ropey = "1.6"
7476
rstest = "0.26.1"

crates/mq-crawler/Cargo.toml

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -12,15 +12,16 @@ repository = "https://github.com/harehare/mq"
1212
version = "0.5.19"
1313

1414
[dependencies]
15+
chromiumoxide = {workspace = true}
1516
clap = {workspace = true, features = ["derive"]}
1617
crossbeam = {workspace = true}
1718
dashmap = {workspace = true}
18-
fantoccini = {workspace = true}
19+
fantoccini = {workspace = true, features = ["rustls-tls"]}
1920
futures = {workspace = true}
2021
miette = {workspace = true, features = ["fancy"]}
2122
mq-lang = {workspace = true}
2223
mq-markdown = {workspace = true}
23-
reqwest = {workspace = true, features = ["json", "rustls-tls"]}
24+
reqwest = {workspace = true, features = ["json"]}
2425
robots_txt = {workspace = true}
2526
scraper = {workspace = true}
2627
serde = {workspace = true, features = ["derive"]}

0 commit comments

Comments
 (0)