-
-
Notifications
You must be signed in to change notification settings - Fork 184
Description
I'm working on a WebPageDownloader to download a web page completely:
/**
 * Downloads a web page and all its assets. The main function of this class is to
 * download a web page and convert its resources (images, CSS files, etc.) to base64 format.
 * The reason for this is to create a self-contained HTML file that can be opened without
 * requiring any external resources, making it easier to view offline or distribute.
 *
 * <p>Embedding is best-effort for stylesheets: a stylesheet that cannot be loaded is reported
 * on {@code System.err} and its original reference is left in place, so one broken asset does
 * not abort the whole download.
 */
public class WebPageDownloader extends WebPageLoader {

    /** Matches CSS {@code url(...)} references; group 1 is the target without surrounding quotes. */
    private static final Pattern CSS_URL_PATTERN = Pattern.compile("url\\(['\"]?(.*?)['\"]?\\)");

    /** Reference prefixes that do not point at a fetchable external resource. */
    private static final String[] NON_FETCHABLE_PREFIXES = {"data:", "javascript:", "mailto:", "about:"};

    private final Encoder resourceEncoder;

    /**
     * Creates a downloader for {@code targetUrl} with no CSS selector and no custom HTTP headers.
     *
     * @param targetUrl URL of the page to download
     */
    public WebPageDownloader(String targetUrl) {
        this(targetUrl, null);
    }

    /**
     * Creates a downloader for {@code targetUrl} with no CSS selector.
     *
     * @param targetUrl       URL of the page to download
     * @param httpHeadersSpec header specification handed to the superclass; may be {@code null}
     */
    public WebPageDownloader(String targetUrl, HttpHeadersSpec httpHeadersSpec) {
        this(targetUrl, null, httpHeadersSpec);
    }

    /**
     * Creates a downloader for {@code targetUrl}.
     *
     * @param targetUrl       URL of the page to download
     * @param cssSelector     selector to wait for before the page is processed; {@code null} to skip waiting
     * @param httpHeadersSpec header specification handed to the superclass; may be {@code null}
     */
    public WebPageDownloader(String targetUrl, String cssSelector, HttpHeadersSpec httpHeadersSpec) {
        super(targetUrl, cssSelector, httpHeadersSpec);
        this.resourceEncoder = new Encoder();
    }

    /**
     * Downloads a web page and its resources, and converts them to a single HTML string.
     * This function is the main entry point for this class.
     *
     * @return the self-contained page, serialized with {@code asXml()} and encoded as UTF-8
     * @throws IOException          if the page's base URL cannot be resolved
     * @throws InterruptedException if waiting for the CSS selector is interrupted
     */
    public byte[] downloadPageWithAssets() throws IOException, InterruptedException {
        WebClient webClient = getAvailableWebClient();
        try {
            HtmlPage page = prepareAndLoadPage(webClient);
            waitForSelectorIfNeeded(page);
            URL baseUrl = page.getFullyQualifiedUrl(page.getBaseURI());
            encodePageResources(page, baseUrl);
            removeJavascriptTags(page);
            String finalHtml = page.asXml();
            return finalHtml.getBytes(StandardCharsets.UTF_8);
        } finally {
            // Always hand the client back, even when loading or encoding fails.
            releaseWebClient(webClient);
        }
    }

    /** Waits for the configured CSS selector (if any) and prints how long the wait took. */
    private void waitForSelectorIfNeeded(HtmlPage page) throws InterruptedException {
        String cssSelector = getCssSelector();
        if (cssSelector != null) {
            long startTime = System.currentTimeMillis();
            waitForSelector(page);
            long executionTime = System.currentTimeMillis() - startTime;
            System.out.println("Waited for CSS selector: '" + cssSelector + "' for " + executionTime + " ms");
        }
    }

    /** Visits every element of the page and embeds the resources it references. */
    private void encodePageResources(HtmlPage page, URL baseUrl) {
        // getByXPath is generically typed, so iterating the result as List<HtmlElement> is an
        // unchecked cast per node. Filter explicitly so that a node of another type cannot
        // trigger a ClassCastException.
        // NOTE(review): HtmlUnit appears to model inline SVG with non-HtmlElement types — confirm.
        List<Object> nodes = page.getByXPath("//*");
        for (Object node : nodes) {
            if (node instanceof HtmlElement) {
                encodeResourceAttributes((HtmlElement) node, baseUrl);
            }
        }
    }

    /**
     * Embeds the resource referenced by the element's {@code src} (or, failing that, {@code href})
     * attribute and, independently, any {@code url(...)} references in its inline {@code style}.
     */
    private void encodeResourceAttributes(HtmlElement element, URL baseUrl) {
        String src = element.getAttribute("src");
        String href = element.getAttribute("href");
        String style = element.getAttribute("style");

        if (isEmbeddableReference(src)) {
            encodeResourceAttribute(element, src, baseUrl, "src");
        } else if (isEmbeddableReference(href) && !isNavigationElement(element)) {
            encodeResourceAttribute(element, href, baseUrl, "href");
        }

        // Checked separately from src/href: an element such as <img src=... style=...> needs both.
        if (!style.isEmpty()) {
            encodeResourceStyle(element, style, baseUrl);
        }
    }

    /**
     * Tells whether the element's {@code href} is a navigation target rather than an asset.
     * Embedding those would download every linked page and break the links themselves.
     */
    private static boolean isNavigationElement(HtmlElement element) {
        String tagName = element.getTagName();
        return "a".equalsIgnoreCase(tagName)
                || "area".equalsIgnoreCase(tagName)
                || "base".equalsIgnoreCase(tagName);
    }

    /**
     * Tells whether {@code reference} points at something that can be fetched and embedded.
     * Empty values, same-document fragments and already-inline or script/mail schemes are not.
     */
    private static boolean isEmbeddableReference(String reference) {
        if (reference == null) {
            return false;
        }
        String trimmed = reference.trim();
        if (trimmed.isEmpty() || trimmed.charAt(0) == '#') {
            return false;
        }
        for (String prefix : NON_FETCHABLE_PREFIXES) {
            if (trimmed.regionMatches(true, 0, prefix, 0, prefix.length())) {
                return false;
            }
        }
        return true;
    }

    /**
     * Tells whether {@code resourceUrl} names a stylesheet, judged by a {@code .css} suffix on the
     * part of the URL before any query string or fragment (so {@code main.css?v=3} qualifies).
     */
    private static boolean isCssUrl(String resourceUrl) {
        int end = resourceUrl.length();
        int queryStart = resourceUrl.indexOf('?');
        if (queryStart >= 0) {
            end = queryStart;
        }
        int fragmentStart = resourceUrl.indexOf('#');
        if (fragmentStart >= 0 && fragmentStart < end) {
            end = fragmentStart;
        }
        String suffix = ".css";
        return end >= suffix.length()
                && resourceUrl.regionMatches(true, end - suffix.length(), suffix, 0, suffix.length());
    }

    /** Resolves {@code url} against {@code baseUrl} and embeds it: stylesheets inline, anything else as an encoded attribute. */
    private void encodeResourceAttribute(HtmlElement element, String url, URL baseUrl, String attributeName) {
        String resourceUrl = UrlUtils.resolveUrl(baseUrl, url);
        if (isCssUrl(resourceUrl)) {
            replaceCssLinkWithStyle(element, resourceUrl);
        } else {
            encodeAttribute(element, resourceUrl, attributeName);
        }
    }

    /**
     * Replaces an element that references an external stylesheet with a {@code <style>} element
     * holding that stylesheet's content (with its own {@code url(...)} references embedded).
     * On failure the original element is left untouched and the problem is reported on stderr.
     */
    private void replaceCssLinkWithStyle(HtmlElement element, String resourceUrl) {
        try {
            String cssContent = loadResourceContent(resourceUrl);
            String encodedCssContent = encodeResourceInStyle(cssContent, new URL(resourceUrl));
            // Only drop the original reference once its replacement is actually in the document.
            if (insertStyleBeforeElement(element, encodedCssContent)) {
                element.remove();
            }
        } catch (IOException e) {
            System.err.println("Failed to inline stylesheet " + resourceUrl + ": " + e.getMessage());
        }
    }

    /** Replaces the attribute's value with whatever the resource encoder produces for {@code resourceUrl}. */
    private void encodeAttribute(HtmlElement element, String resourceUrl, String attributeName) {
        String dataUrlBase64 = resourceEncoder.encode(resourceUrl);
        element.setAttribute(attributeName, dataUrlBase64);
    }

    /**
     * Reads the resource at {@code resourceUrl} fully and decodes it as UTF-8 text.
     *
     * @throws IOException if the resource cannot be opened or read
     */
    private String loadResourceContent(String resourceUrl) throws IOException {
        // try-with-resources: the stream was previously never closed.
        try (InputStream resourceStream = resourceEncoder.getUrlAsStream(resourceUrl)) {
            if (resourceStream == null) {
                throw new IOException("No content available for " + resourceUrl);
            }
            return new String(resourceStream.readAllBytes(), StandardCharsets.UTF_8);
        }
    }

    /**
     * Inserts a {@code <style type="text/css">} element with the given content right before
     * {@code element}.
     *
     * @return {@code true} if the style element was inserted; {@code false} if {@code element}
     *         has no owning page or no parent to insert into
     */
    private boolean insertStyleBeforeElement(HtmlElement element, String cssContent) {
        DomNode parentNode = element.getParentNode();
        HtmlPage page = element.getHtmlPageOrNull();
        if (parentNode == null || page == null) {
            return false;
        }
        HtmlStyle newElement = (HtmlStyle) page.createElement("style");
        newElement.setAttribute("type", "text/css");
        newElement.setTextContent(cssContent);
        parentNode.insertBefore(newElement, element);
        return true;
    }

    /** Rewrites the element's inline {@code style} so its {@code url(...)} references are embedded. */
    private void encodeResourceStyle(HtmlElement element, String style, URL baseUrl) {
        String encodedStyle = encodeResourceInStyle(style, baseUrl);
        element.setAttribute("style", encodedStyle);
    }

    /**
     * Encodes URLs inside a CSS or style attribute. This is to make sure the CSS rules are
     * self-contained and do not require external resources.
     *
     * @param style   CSS text (a stylesheet body or a {@code style} attribute value)
     * @param baseUrl URL that relative references inside {@code style} are resolved against
     * @return {@code style} with each fetchable {@code url(...)} reference replaced by an embedded one
     */
    private String encodeResourceInStyle(String style, URL baseUrl) {
        Matcher matcher = CSS_URL_PATTERN.matcher(style);
        StringBuilder encodedStyle = new StringBuilder();
        int lastIndex = 0;
        while (matcher.find()) {
            // Append everything before the current match
            encodedStyle.append(style, lastIndex, matcher.start());

            String reference = matcher.group(1).trim();
            if (!isEmbeddableReference(reference)) {
                // Already inline (data:) or nothing to fetch: keep the original text verbatim.
                encodedStyle.append(matcher.group());
            } else {
                String resourceUrl = UrlUtils.resolveUrl(baseUrl, reference);
                if (isCssUrl(resourceUrl)) {
                    encodedStyle.append(encodeCssResource(matcher, resourceUrl));
                } else {
                    encodedStyle.append(encodeNonCssResource(resourceUrl));
                }
            }
            lastIndex = matcher.end();
        }
        // Append everything after the last match
        encodedStyle.append(style, lastIndex, style.length());
        return encodedStyle.toString();
    }

    /**
     * Embeds a stylesheet referenced from CSS (e.g. {@code @import url(other.css)}): the sheet is
     * loaded, its own references are embedded recursively, and the result is returned as a
     * {@code url(data:text/css;base64,...)} token. Pasting raw CSS text inside {@code url(...)}
     * would not be a valid URL, hence the data URI.
     *
     * @param matcher     the current match; its full text is returned unchanged if loading fails
     * @param resourceUrl absolute URL of the stylesheet
     */
    private String encodeCssResource(Matcher matcher, String resourceUrl) {
        try {
            String cssText = loadResourceContent(resourceUrl);
            String inlinedCss = encodeResourceInStyle(cssText, new URL(resourceUrl));
            // Fully qualified: this class already uses a project type named Encoder.
            String base64 = java.util.Base64.getEncoder()
                    .encodeToString(inlinedCss.getBytes(StandardCharsets.UTF_8));
            return "url(data:text/css;base64," + base64 + ")";
        } catch (IOException e) {
            // Same best-effort policy as replaceCssLinkWithStyle: report and keep the reference.
            System.err.println("Failed to inline stylesheet " + resourceUrl + ": " + e.getMessage());
            return matcher.group();
        }
    }

    /** Returns a {@code url(...)} token wrapping the resource encoder's output for {@code resourceUrl}. */
    private String encodeNonCssResource(String resourceUrl) {
        String dataUrlBase64 = resourceEncoder.encode(resourceUrl);
        return "url(" + dataUrlBase64 + ")";
    }

    /** Removes every node matched by {@code //script} from the page. */
    private void removeJavascriptTags(HtmlPage page) {
        // Typed as Object and filtered for the same reason as in encodePageResources.
        List<Object> scripts = page.getByXPath("//script");
        for (Object script : scripts) {
            if (script instanceof DomNode) {
                ((DomNode) script).remove();
            }
        }
    }
}
This works fine with static websites, but for SPAs (apps built with Vue, React, and other JS frameworks) all it is able to get is a "blank" page: the app's root DOM nodes, which these frameworks normally populate, are empty. E.g. <div id="app"></div>
No matter how much I increase the timeout, the HtmlUnit WebClient can't seem to render the page. I tried enabling the option to throw on script errors and, yes, there are a lot of JS errors. Since my application downloads web pages, and the target website can be anywhere on the internet, there is no way for me to code a "fix" for the website/web page itself. Plus, every website has different scripts.
A lot of suggestions I get are to use Selenium and Chrome, but I would rather stick with pure Java implementation which is HtmlUnit.
The implementation of the WebPageLoader can be found here.