✊ Bookmarklet 网页文本抓取方案 2.0

(\ _ /)
( ・-・)
/っ :hot_beverage:   以前发过一个,不过抓整篇文档页的效果不太好,升级下。

简而言之就是 :down_left_arrow: ( 点 一 下 书 签 )

例 如 这 种 说 明 文 档 :

抓出来就是 Markdown 格式的了 :down_left_arrow: ( 后 续 再 投 喂 给 AI )

Jina、Firecrawl 那种抓 URL 的有两个缺点:① 是以 URL 为单位抓、里面有不少浪费 Tokens 的引导菜单 URL 啥的,洁癖.jpg;② 是这种远程抓取方式容易有鉴权问题。

(\ _ /)
( ・-・)
/っ :cheese_wedge: 总而言之就是这样 ↓ JavaScript 源码 :down_left_arrow:

随便新建个书签 Copy 进「网址」框
javascript(删掉这个中文括号,因为直接发贴会 403):(function(){var s=document.createElement('script');s.src='https://unpkg.com/[email protected]/dist/turndown.js';s.onload=run;s.onerror=function(){alert('Failed to load Turndown.js (CSP?)%27)};document.head.appendChild(s);function run(){var o=document.createElement(%27div%27);o.style.cssText=%27position:absolute;pointer-events:none;z-index:999999;outline:2px solid %2310b981;background:rgba(16,185,129,0.08);transition:all 0.05s%27;document.body.appendChild(o);var c=null;function hl(el){var r=el.getBoundingClientRect();o.style.top=(r.top+scrollY)+%27px%27;o.style.left=(r.left+scrollX)+%27px%27;o.style.width=r.width+%27px%27;o.style.height=r.height+%27px%27}function mv(e){c=e.target;hl(c)}function wh(e){e.preventDefault();if(e.deltaY<0&&c.parentElement&&c.parentElement!==document.documentElement)c=c.parentElement;else if(e.deltaY>0&&c.firstElementChild)c=c.firstElementChild;hl(c)}function cl(e){e.preventDefault();e.stopPropagation();var td=new TurndownService({headingStyle:%27atx%27,codeBlockStyle:%27fenced%27,hr:%27---%27});td.remove([%27script%27,%27style%27,%27noscript%27,%27nav%27,%27footer%27]);var md=td.turndown(c);navigator.clipboard.writeText(md).then(function(){alert(%27\u2705 Markdown copied (%27+md.length+%27 chars)%27)}).catch(function(){var ta=document.createElement(%27textarea%27);ta.value=md;ta.style.cssText=%27position:fixed;opacity:0%27;document.body.appendChild(ta);ta.select();document.execCommand(%27copy%27);document.body.removeChild(ta);alert(%27\u2705 Markdown copied (%27+md.length+%27 chars)%27)});dn()}function dn(){o.remove();document.removeEventListener(%27mouseover%27,mv,true);document.removeEventListener(%27wheel%27,wh,true);document.removeEventListener(%27click%27,cl,true)}document.addEventListener(%27mouseover%27,mv,true);document.addEventListener(%27wheel%27,wh,{capture:true,passive:false});document.addEventListener(%27click%27,cl,true)}})();
17 个赞

前排支持~

2 个赞

如果增加选中后自动弹窗保存为md就更好了

可以的:

javascript(中文括号删掉):(function(){var s=document.createElement("script");s.src="https://unpkg.com/[email protected]/dist/turndown.js";s.onload=run;s.onerror=function(){alert("Failed to load Turndown.js (CSP?)")};document.head.appendChild(s);function run(){var o=document.createElement("div");o.style.cssText="position:absolute;pointer-events:none;z-index:999999;outline:2px solid %2310b981;background:rgba(16,185,129,0.08);transition:all 0.05s";document.body.appendChild(o);var c=null;function hl(el){var r=el.getBoundingClientRect();o.style.top=(r.top+scrollY)+"px";o.style.left=(r.left+scrollX)+"px";o.style.width=r.width+"px";o.style.height=r.height+"px"}function mv(e){c=e.target;hl(c)}function wh(e){e.preventDefault();if(e.deltaY<0&&c.parentElement&&c.parentElement!==document.documentElement)c=c.parentElement;else if(e.deltaY>0&&c.firstElementChild)c=c.firstElementChild;hl(c)}function fb(md,fn){var b=new Blob([md],{type:"text/markdown"});var a=document.createElement("a");a.href=URL.createObjectURL(b);a.download=fn;document.body.appendChild(a);a.click();document.body.removeChild(a);URL.revokeObjectURL(a.href)}function cl(e){e.preventDefault();e.stopPropagation();var td=new TurndownService({headingStyle:"atx",codeBlockStyle:"fenced",hr:"---"});td.remove(["script","style","noscript","nav","footer"]);var md=td.turndown(c);var now=new Date();var ts=now.getFullYear()+String(now.getMonth()+1).padStart(2,"0")+String(now.getDate()).padStart(2,"0")+"_"+String(now.getHours()).padStart(2,"0")+String(now.getMinutes()).padStart(2,"0")+String(now.getSeconds()).padStart(2,"0");var fn=ts+"_"+md.length+".md";if(window.showSaveFilePicker){window.showSaveFilePicker({suggestedName:fn,startIn:"desktop",types:[{description:"Markdown",accept:{"text/markdown":[".md"]}}]}).then(function(h){return h.createWritable().then(function(w){return w.write(md).then(function(){return w.close()})})}).catch(function(err){if(err.name!=="AbortError")fb(md,fn)})}else{fb(md,fn)}dn()}function 
dn(){o.remove();document.removeEventListener("mouseover",mv,true);document.removeEventListener("wheel",wh,true);document.removeEventListener("click",cl,true)}document.addEventListener("mouseover",mv,true);document.addEventListener("wheel",wh,{capture:true,passive:false});document.addEventListener("click",cl,true)}})()
1 个赞

好东西,感谢!

1 个赞

感谢分享!

2 个赞

为什么不使用 crawl4ai

1 个赞

看了下:crawl4ai - 开源项目详情

:distorted_face: 太复杂了,我的需求只是鼠标点一下复制(相当于 Ctrl + A + C 的性质)

然后这种爬虫遇到权限贴就寄,原因就是主题帖的 ①②

感谢,但是也有其他的插件



最近在研究把各种源作为剪藏的素材放进我的本地agent的知识库,但是一众国内平台反爬太严格了,知乎啥的太封闭了,还有wx文章

1 个赞

不错不错

1 个赞