
CNKI(中国知网)是国内核心学术文献数据库,文献关键词、元数据、引文等信息可有效表征领域研究脉络与热点分布。基于采集数据开展关键词共现分析,能够量化挖掘领域研究主题、主题间关联关系与知识群落结构。本文基于 Playwright 实现知网动态页面数据采集,结合亿牛云固定转发代理规避 IP 封禁风险,依托 NetworkX 完成关键词共现网络建模,并通过 pyecharts 实现图谱可视化,形成一套完整的学术数据采集与网络分析技术方案。版权声明:知网文献内容受著作权相关法律法规保护,本文技术方案仅限个人学术研究场景使用,商用及公开传播数据需提前获得官方授权。一、整体技术架构与数据采集方案知网检索列表与论文详情页均采用 JS 异步渲染机制,无法通过静态请求直接解析数据。本次方案技术栈:Playwright(动态页面渲染)+ 代理服务(反封禁)+ 数据解析 + 关键词共现算法 + 图可视化。1.1 采集字段设计结合学术分析需求,确定采集字段、数据来源及业务用途如下:表格
| 字段 | 数据来源 | 核心用途 |
|---|---|---|
| 论文标题 | 论文详情页 | 文献唯一标识 |
| 作者 / 所属单位 | 论文详情页 | 科研合作网络分析 |
| 摘要 | 论文详情页 | 文本特征挖掘 |
| 关键词 | 论文详情页 | 共现网络核心节点 |
| 引用 / 被引数据 | 论文详情页 | 文献引证网络分析 |
1.2 环境依赖与代理配置
1.2.2 代理选型与接入
知网具备严格的访问风控策略,高频持续访问易触发 IP 限流、封禁。方案采用亿牛云固定转发代理适配浏览器自动化场景:代理 IP 有效时长 1~3 分钟,可覆盖单页面完整加载周期,配套海量 IP 池可支撑批量采集任务。代理标准接入地址:http://user:pass@t.16yun.cn:31111,通过 Chromium 启动参数全局配置代理,无需单次请求重复设置。
二、核心代码实现
2.1 数据实体与浏览器代理封装
基于数据类统一管理论文实体,封装带反爬、代理能力的 Playwright 浏览器实例,关闭自动化特征以规避页面检测。
python运行
import asyncio
import json
import random
import time
import hashlib
from dataclasses import dataclass, field
from typing import Optional
from playwright.async_api import async_playwright
# 论文数据实体
@dataclass
class Paper:
    """Metadata record for a single paper.

    ``uid`` is derived from ``url`` in ``__post_init__`` unless the caller
    supplies one, so records built from the same URL share the same id.
    """

    title: str = ""
    authors: list[str] = field(default_factory=list)
    orgs: list[str] = field(default_factory=list)
    abstract: str = ""
    keywords: list[str] = field(default_factory=list)
    year: str = ""
    source: str = ""
    cited_count: int = 0
    url: str = ""
    uid: str = ""

    def __post_init__(self):
        """Fill ``uid`` with the first 12 hex chars of the URL's MD5 digest."""
        # An explicitly supplied uid always wins; no URL means no uid.
        if not self.uid and self.url:
            # MD5 is used purely as a short identifier, not for security;
            # saying so keeps the call usable on FIPS-restricted builds.
            digest = hashlib.md5(self.url.encode(), usedforsecurity=False)
            self.uid = digest.hexdigest()[:12]
# 集成代理与反检测的浏览器类
class CnkiBrowser:
    """Headless Chromium session driven by Playwright, with an optional proxy.

    The proxy is enabled only when both ``proxy_user`` and ``proxy_pass``
    are non-empty.
    """

    # Proxy endpoint without credentials; they are passed separately.
    PROXY_SERVER = "http://t.16yun.cn:31111"
    USER_AGENT = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 Chrome/125.0.0.0 Safari/537.36"
    )

    def __init__(self, proxy_user: str = "", proxy_pass: str = ""):
        """Store proxy credentials; no browser is started here."""
        self.proxy_user = proxy_user
        self.proxy_pass = proxy_pass
        self.use_proxy = bool(proxy_user and proxy_pass)
        self._playwright = None
        self.browser = None
        self.context = None
        self.page = None

    def _proxy_settings(self):
        """Return the Playwright ``proxy`` launch option, or None if disabled."""
        if not self.use_proxy:
            return None
        return {
            "server": self.PROXY_SERVER,
            "username": self.proxy_user,
            "password": self.proxy_pass,
        }

    async def launch(self):
        """Start Chromium, create a context and page, and return the page."""
        launch_options = {
            "headless": True,
            # Hide the automation flag and disable the sandbox.
            "args": [
                "--disable-blink-features=AutomationControlled",
                "--no-sandbox",
            ],
        }
        # Chromium's --proxy-server switch does not accept embedded
        # "user:pass@" credentials, so an authenticated proxy must be passed
        # through Playwright's dedicated ``proxy`` option instead.
        proxy = self._proxy_settings()
        if proxy is not None:
            launch_options["proxy"] = proxy

        self._playwright = await async_playwright().start()
        self.browser = await self._playwright.chromium.launch(**launch_options)
        # Context carries the user agent and viewport for every page.
        self.context = await self.browser.new_context(
            user_agent=self.USER_AGENT,
            viewport={"width": 1920, "height": 1080},
        )
        # Runs before any page script: make navigator.webdriver undefined.
        await self.context.add_init_script("""
Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
""")
        self.page = await self.context.new_page()
        return self.page
# Release the browser first, then stop the Playwright driver.
# Both handles are checked before use, and __init__ sets them to None, so
# calling this without a prior launch() is a no-op.
# NOTE(review): in this paste the final statement is fused with the article's
# section 2.2 text; the code part of that line ends at `.stop()`.
async def close(self):
# Resource cleanup
if self.browser:
await self.browser.close()
if self._playwright:
await self._playwright.stop()2.2 检索与详情页数据采集实现关键词分页检索、论文 URL 提取、单篇文献全字段解析,增设随机请求间隔模拟人工访问,提升采集稳定性。python运行
from bs4 import BeautifulSoup
class CnkiScraper:
    """Collects paper detail-page URLs from search results and fetches them.

    All page access goes through ``browser.page`` of the wrapped browser
    object, which must already be launched.
    """

    SEARCH_URL = "https://kns.cnki.net/kns8s/defaultresult/index"

    def __init__(self, browser: "CnkiBrowser"):
        self.b = browser

    @classmethod
    def _build_search_url(cls, keyword: str, page: int) -> str:
        """Return the search URL for one result page of ``keyword``."""
        from urllib.parse import quote

        # Percent-encode the keyword so characters such as '&', '#' or '+'
        # cannot break or truncate the query string.
        encoded = quote(keyword, safe="")
        return (
            f"{cls.SEARCH_URL}?crossids=Zk0sMk1s&korder=SU"
            f"&kw={encoded}&boolsearch=false&pageNum={page}"
        )

    async def search(self, keyword: str, max_pages: int = 5, delay: float = 4.0) -> list[str]:
        """Walk up to ``max_pages`` result pages and collect detail-page links.

        Stops early when a page yields no links or when loading a page
        raises. Returns the de-duplicated URLs in first-seen order.
        """
        url_list: list[str] = []
        seen: set[str] = set()  # O(1) duplicate check; url_list keeps order
        for page in range(1, max_pages + 1):
            req_url = self._build_search_url(keyword, page)
            try:
                await self.b.page.goto(req_url, wait_until="networkidle", timeout=30000)
                # Randomised pause between pages.
                await asyncio.sleep(delay + random.uniform(1, 3))
                html = await self.b.page.content()
                soup = BeautifulSoup(html, "html.parser")
                # Detail-page links are the anchors whose href contains "abstract".
                link_items = soup.select("a[href*='abstract']")
                for link in link_items:
                    href = link.get("href", "")
                    full_url = f"https://kns.cnki.net{href}" if href.startswith("/") else href
                    if full_url not in seen:
                        seen.add(full_url)
                        url_list.append(full_url)
                print(f"第{page}页采集完成,累计获取链接:{len(url_list)} 条")
                if not link_items:
                    break
            except Exception as e:
                # Best effort: keep what was collected so far and stop paging.
                print(f"第{page}页检索异常:{e}")
                break
        return url_list

    async def fetch_papers(self, urls: list[str], delay: float = 3.0) -> "list[Paper]":
        """Fetch every URL in turn and return the papers that have a title.

        A failure on one URL is reported and skipped; it does not abort the
        batch.
        """
        paper_list = []
        last_idx = len(urls) - 1
        for idx, url in enumerate(urls):
            try:
                paper = await self._fetch_single_paper(url)
                if paper.title:
                    paper_list.append(paper)
                    print(f"[{idx+1}/{len(urls)}] 采集成功:{paper.title[:30]},关键词数量:{len(paper.keywords)}")
            except Exception as e:
                print(f"[{idx+1}] 链接采集失败:{e}")
            # Pause between requests only; nothing follows the last one.
            if idx < last_idx:
                await asyncio.sleep(delay + random.uniform(0.5, 2))
        return paper_list
# Load one paper detail page and return a Paper filled from its DOM.
# The title comes from the first <h1>; authors, organisations, abstract,
# keywords and year come from a single in-page JavaScript call that matches
# elements by class-name substring ("author", "org", "abstract", "keyword",
# "date").
# NOTE(review): the script initialises `source` and `cited` but never sets
# them, and they are not copied onto the Paper, so Paper.source and
# Paper.cited_count keep their defaults.
# NOTE(review): in this paste the final `return paper` is fused with the
# article's section 2.3 text.
async def _fetch_single_paper(self, url: str) -> Paper:
"""解析单篇论文全字段"""
paper = Paper(url=url)
await self.b.page.goto(url, wait_until="networkidle", timeout=30000)
# Fixed 2 s pause after the page reports network idle.
await asyncio.sleep(2)
# Extract the title
title_elem = await self.b.page.query_selector("h1")
if title_elem:
paper.title = (await title_elem.inner_text()).strip()
# Extract the structured fields in one in-page JavaScript call.
# The script drops organisation texts of 100+ characters and keyword texts
# of 30+ characters, strips the "摘要:" label from the abstract, and takes the
# first 4-digit run in the date element as the year.
page_data = await self.b.page.evaluate("""
() => {
const res = {authors:[], orgs:[], abstract:'', keywords:[], year:'', source:'', cited:0};
document.querySelectorAll('[class*="author"] a').forEach(a => {
let text = a.textContent.trim();
text && res.authors.push(text);
});
document.querySelectorAll('[class*="org"]').forEach(o => {
let text = o.textContent.trim();
text && text.length < 100 && res.orgs.push(text);
});
let absNode = document.querySelector('[class*="abstract"]');
if(absNode) res.abstract = absNode.textContent.replace('摘要:', '').trim();
document.querySelectorAll('[class*="keyword"] a').forEach(k => {
let text = k.textContent.trim();
text && text.length < 30 && res.keywords.push(text);
});
let dateNode = document.querySelector('[class*="date"]');
if(dateNode){
let yearMatch = dateNode.textContent.match(/\d{4}/);
yearMatch && (res.year = yearMatch[0]);
}
return res;
}
""")
# Copy the extracted fields onto the Paper, with empty fallbacks.
paper.authors = page_data.get("authors", [])
paper.orgs = page_data.get("orgs", [])
paper.abstract = page_data.get("abstract", "")
paper.keywords = page_data.get("keywords", [])
paper.year = page_data.get("year", "")
return paper2.3 关键词共现网络建模基于共现规则构建无向图:同一篇文献中同时出现的两个关键词建立连接,共现频次作为边权重;结合节点频次、度中心性、社区划分完成多维分析。python运行
from collections import defaultdict, Counter
from itertools import combinations
import networkx as nx
class KeywordCooccurrence:
    """Builds and queries an undirected keyword co-occurrence graph.

    Nodes are keywords carrying a ``count`` attribute (number of papers that
    list the keyword); edges carry a ``weight`` attribute (number of papers
    that list both keywords).
    """

    def __init__(self, papers: "list[Paper]"):
        self.papers = papers
        self.graph = nx.Graph()

    @staticmethod
    def _count_cooccurrences(papers):
        """Return ``(keyword -> paper count, (kw_a, kw_b) -> paper count)``.

        Keywords are stripped and de-duplicated per paper; each pair key is
        ordered alphabetically, so (a, b) and (b, a) share one entry.
        """
        node_frequency = Counter()
        edge_weight = defaultdict(int)
        for paper in papers:
            # Sorting makes pair keys canonical and keeps counter insertion
            # order (hence tie-breaking in most_common) reproducible.
            keywords = sorted({kw.strip() for kw in paper.keywords if kw.strip()})
            node_frequency.update(keywords)
            for pair in combinations(keywords, 2):
                edge_weight[pair] += 1
        return node_frequency, edge_weight

    def build_graph(self, min_count: int = 2, top_k: int = 50):
        """Rebuild ``self.graph`` from ``self.papers`` and return it.

        Args:
            min_count: Minimum co-occurrence count for an edge to be kept.
                It does not filter nodes.
            top_k: Number of most frequent keywords kept as nodes.
        """
        # Start from an empty graph so repeated calls do not accumulate
        # nodes or edges left over from a previous build.
        self.graph.clear()
        node_frequency, edge_weight = self._count_cooccurrences(self.papers)

        for kw, cnt in node_frequency.most_common(top_k):
            self.graph.add_node(kw, count=cnt)

        # Keep an edge only if both endpoints survived the top_k cut.
        kept = set(self.graph.nodes)
        for (kw1, kw2), weight in edge_weight.items():
            if weight >= min_count and kw1 in kept and kw2 in kept:
                self.graph.add_edge(kw1, kw2, weight=weight)

        print(f"共现网络构建完成 | 节点数:{self.graph.number_of_nodes()},边数:{self.graph.number_of_edges()}")
        return self.graph

    def get_top_keywords(self, top_n: int = 20) -> list:
        """Return the ``top_n`` ``(keyword, attrs)`` pairs by ``count``, descending."""
        return sorted(
            self.graph.nodes(data=True),
            key=lambda item: item[1].get("count", 0),
            reverse=True,
        )[:top_n]

    def get_degree_centrality(self, top_n: int = 15) -> list:
        """Return the ``top_n`` ``(keyword, degree centrality)`` pairs, descending."""
        centrality = nx.degree_centrality(self.graph)
        return sorted(centrality.items(), key=lambda item: -item[1])[:top_n]
# Partition self.graph with networkx's louvain_communities (seed fixed at 42)
# and return {label: [keywords]} for every community with 3+ members.
# Labels use the community's position in the louvain result, so numbering
# can have gaps where smaller communities were dropped.
# NOTE(review): in this paste the final `return result` is fused with the
# article's section 2.4 text.
def detect_community(self) -> dict:
"""Louvain算法划分研究子主题群落"""
# Imported here rather than at module level.
from networkx.algorithms.community import louvain_communities
communities = louvain_communities(self.graph, seed=42)
result = {}
for idx, comm in enumerate(communities):
# Skip communities with fewer than three keywords.
if len(comm) >= 3:
result[f"子主题{idx+1}"] = list(comm)
return result2.4 网络图谱可视化基于 pyecharts 实现交互式图谱渲染,节点大小关联关键词出现频次,边粗细映射共现强度,支持节点拖拽交互。python运行
from pyecharts import options as opts
from pyecharts.charts import Graph
# Renders a keyword co-occurrence graph to a standalone HTML file with
# pyecharts.
# NOTE(review): in this paste the final print statement is fused with the
# article's section 2.5 text.
class GraphVisualizer:
# render_html: keep the `max_nodes` nodes with the highest "count" attribute,
# keep the edges between them whose "weight" is at least `min_weight`, and
# write the chart to `output_name`.
def render_html(self, graph, output_name="keyword_graph.html", min_weight=2, max_nodes=50):
"""生成共现图谱HTML文件"""
# Select the core nodes (highest "count" first)
core_nodes = sorted(graph.nodes, key=lambda n: graph.nodes[n].get("count", 0), reverse=True)[:max_nodes]
# Build the node payload
nodes = []
for node in core_nodes:
nodes.append({
"name": node,
# NOTE(review): symbol size is driven by node degree, while the chart
# subtitle below says node size represents keyword frequency.
"symbolSize": 15 + graph.degree(node) * 3,
"value": graph.nodes[node].get("count", 0)
})
# Build the edge payload
links = []
for u, v, attr in graph.edges(data=True):
# core_nodes is a list, so each membership test is a linear scan.
if u in core_nodes and v in core_nodes and attr.get("weight", 0) >= min_weight:
links.append({
"source": u,
"target": v,
"value": attr.get("weight", 1),
# Line width follows the weight, capped at 8.
"lineStyle": {"width": min(attr.get("weight", 1), 8)}
})
# Assemble the chart
chart = (
Graph(init_opts=opts.InitOpts(width="1400px", height="900px"))
.add(
series_name="",
nodes=nodes,
links=links,
repulsion=2000,
is_draggable=True,
label_opts=opts.LabelOpts(is_show=True, font_size=12)
)
.set_global_opts(
title_opts=opts.TitleOpts(
title="领域研究热点关键词共现图谱",
subtitle="节点大小=关键词出现频次 | 连线粗细=关键词共现强度"
)
)
)
chart.render(output_name)
print(f"可视化图谱已生成:{output_name}")2.5 端到端调度流程串联检索、采集、分析、可视化全流程,统一调度各模块并输出分析结果。python运行
async def run_task(
    keyword: str = "深度学习",
    max_papers: int = 100,
    proxy_user: str = "your_user",
    proxy_pass: str = "your_password",
):
    """Run the whole pipeline: search, fetch, analyse, render.

    Args:
        keyword: Search keyword; also used in the output file name.
        max_papers: Upper bound on the number of detail pages fetched.
        proxy_user: Proxy account name passed to ``CnkiBrowser``.
        proxy_pass: Proxy password passed to ``CnkiBrowser``.
    """
    browser = CnkiBrowser(proxy_user=proxy_user, proxy_pass=proxy_pass)
    try:
        await browser.launch()
        scraper = CnkiScraper(browser)

        # 1. Collect detail-page links
        print("===== 开始检索文献 =====")
        url_list = (await scraper.search(keyword, max_pages=8))[:max_papers]

        # 2. Fetch paper details
        print("\n===== 开始采集文献详情 =====")
        paper_data = await scraper.fetch_papers(url_list, delay=3.0)
        print(f"文献采集完成,有效数据量:{len(paper_data)} 篇")

        # 3. Keyword co-occurrence analysis
        print("\n===== 构建关键词共现网络 =====")
        analyzer = KeywordCooccurrence(paper_data)
        graph = analyzer.build_graph(min_count=2)

        print("\n===== 高频关键词TOP20 =====")
        for kw, info in analyzer.get_top_keywords(20):
            print(f"{info['count']:>4} 次 | {kw}")

        print("\n===== 研究子主题群落 =====")
        community_data = analyzer.detect_community()
        for name, members in community_data.items():
            print(f"{name}(共{len(members)}个关键词):{', '.join(members[:5])}")

        # 4. Render the interactive graph
        visual = GraphVisualizer()
        visual.render_html(graph, f"graph_{keyword}.html", min_weight=2, max_nodes=50)
    finally:
        # Always release the browser, even when a step above raises;
        # otherwise the headless Chromium process would be left running.
        await browser.close()
if __name__ == "__main__":
    asyncio.run(run_task(keyword="深度学习", max_papers=100))

三、代理接入要点说明
适配性优势:固定转发代理 IP 有效期 1~3 分钟,匹配 Playwright 页面完整加载时长,避免 IP 切换导致请求中断;海量 IP 池可支撑规模化采集。
接入方式:本方案基于 Chromium 启动参数全局配置代理,适用于浏览器自动化场景;若使用 requests/aiohttp 等 HTTP 请求库,可通过 Proxy-Tunnel 请求头动态切换 IP。
异常排查:出现 407 错误时,优先校验代理账号、密码有效性。

四、输出结果解读
运行程序后将生成交互式 HTML 图谱,结合数据输出可完成领域分析:
核心节点:图谱中尺寸最大的节点为领域主流研究关键词;
关联强度:连线越粗,代表两个关键词在文献中共同出现频次越高,主题关联越紧密;
群落划分:算法自动划分的子主题,对应领域下不同细分研究方向。

五、常见问题与优化方案
检索无结果 / 访问被拦截 优化:校验 UA 与反检测脚本有效性,延长请求间隔至 4s 以上;结合代理服务降低风控触发概率,必要时引入 Cookie 池。
关键词字段为空 原因:知网页面迭代导致选择器失效。优化:通过浏览器开发者工具重新定位 DOM 节点,更新页面解析选择器。
图谱结构杂乱 优化:缩小检索关键词范围、提高共现频次阈值、减少展示节点数量。
文本乱码 优化:知网部分页面采用 GBK 编码,可引入编码检测库自动适配解码格式。

六、方案边界与注意事项
采集规模:建议单任务采集量控制在 100~200 篇,高频批量采集仍需严格控制请求频率,规避风控。
合规要求:本技术仅用于个人学术研究,严禁未经授权抓取、传播、商用知网文献数据。
分析局限性:关键词共现仅表征关联关系,无法推导因果逻辑;节点中心性仅代表主题枢纽性,不等同于研究价值高低。
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 [email protected] 删除。
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 [email protected] 删除。