0% found this document useful (0 votes)
8 views2 pages

PDFScraper

The document is a C program that downloads PDF files from a specified URL. It uses the libcurl library to fetch HTML content and the libxml library to parse the HTML for links to PDF files. The program extracts these links, constructs the full URLs, and downloads the files to the local system.

Uploaded by

pedronunezwork
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT or PDF, or read online on Scribd
0% found this document useful (0 votes)
8 views2 pages

PDFScraper

The document is a C program that downloads PDF files from a specified URL. It uses the libcurl library to fetch HTML content and the libxml library to parse the HTML for links to PDF files. The program extracts these links, constructs the full URLs, and downloads the files to the local system.

Uploaded by

pedronunezwork
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT or PDF, or read online on Scribd

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <curl/curl.h>
#include <libxml/HTMLparser.h>

/* Growable byte buffer used to accumulate an HTTP response body in memory. */
struct MemoryStruct {
    char *memory;   /* heap-allocated buffer, kept NUL-terminated */
    size_t size;    /* bytes stored, excluding the terminating NUL */
};

/*
 * libcurl write callback: appends size * nmemb bytes from `contents` to the
 * MemoryStruct pointed to by `userp`, keeping the buffer NUL-terminated so it
 * can be used as a C string.
 *
 * Returns the number of bytes consumed, or 0 on allocation failure (which
 * makes libcurl abort the transfer).
 */
static size_t WriteMemoryCallback(void *contents, size_t size, size_t nmemb, void *userp) {
    size_t realsize = size * nmemb;
    struct MemoryStruct *mem = (struct MemoryStruct *)userp;

    /* Grow via a temporary: assigning realloc's result directly to
     * mem->memory would leak the original buffer if realloc failed. */
    char *grown = realloc(mem->memory, mem->size + realsize + 1);
    if (grown == NULL) return 0;   /* signal libcurl to abort the transfer */
    mem->memory = grown;

    memcpy(&(mem->memory[mem->size]), contents, realsize);
    mem->size += realsize;
    mem->memory[mem->size] = 0;    /* re-terminate after every append */
    return realsize;
}

void download_file(const char *url, const char *filename) {


FILE *fp = fopen(filename, "wb");
if (!fp) return;
CURL *curl = curl_easy_init();
if (!curl) return;
curl_easy_setopt(curl, CURLOPT_URL, url);
curl_easy_setopt(curl, CURLOPT_WRITEDATA, fp);
curl_easy_perform(curl);
curl_easy_cleanup(curl);
fclose(fp);
}

/*
 * Recursive helper: visits `node` and all of its siblings and descendants
 * in document order, downloading every <a href> whose href contains ".pdf".
 * Relative hrefs are resolved naively against base_url by concatenation.
 */
static void scan_for_pdf_links(xmlNode *node, const char *base_url) {
    for (xmlNode *cur = node; cur; cur = cur->next) {
        if (cur->type == XML_ELEMENT_NODE &&
            strcmp((const char *)cur->name, "a") == 0) {
            xmlChar *href = xmlGetProp(cur, (const xmlChar *)"href");
            if (href) {
                if (strstr((const char *)href, ".pdf")) {
                    char full_url[1024];
                    /* Absolute URLs are used verbatim; anything else is
                     * treated as relative to base_url. */
                    if (strncmp((const char *)href, "http", 4) == 0) {
                        snprintf(full_url, sizeof(full_url), "%s",
                                 (const char *)href);
                    } else {
                        snprintf(full_url, sizeof(full_url), "%s/%s",
                                 base_url, (const char *)href);
                    }
                    /* Use the last path component as the local filename. */
                    const char *filename = strrchr(full_url, '/');
                    if (filename && *(filename + 1)) {
                        download_file(full_url, filename + 1);
                    }
                }
                /* Free unconditionally: the original leaked href whenever
                 * the link was not a PDF. */
                xmlFree(href);
            }
        }
        scan_for_pdf_links(cur->children, base_url);
    }
}

/*
 * Parses `html` (fetched from base_url) and downloads every linked PDF
 * into the current working directory.
 */
void find_and_download_pdfs(const char *html, const char *base_url) {
    htmlDocPtr doc = htmlReadMemory(html, strlen(html), base_url, NULL,
                                    HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);
    if (!doc) return;

    /* The original iterated with xmlNextNode(), which does not exist in
     * libxml2; walk the tree explicitly instead. */
    scan_for_pdf_links(xmlDocGetRootElement(doc), base_url);

    xmlFreeDoc(doc);
    /* NOTE: xmlCleanupParser() (called here in the original) tears down
     * libxml2 global state and must only run once at program exit, so it
     * is intentionally omitted from this per-call path. */
}

int main(int argc, char **argv) {


if (argc != 2) {
printf("Usage: %s <url>\n", argv[0]);
return 1;
}

CURL *curl = curl_easy_init();


if (!curl) return 1;

struct MemoryStruct chunk = { malloc(1), 0 };


curl_easy_setopt(curl, CURLOPT_URL, argv[1]);
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteMemoryCallback);
curl_easy_setopt(curl, CURLOPT_WRITEDATA, (void *)&chunk);
curl_easy_perform(curl);
curl_easy_cleanup(curl);

find_and_download_pdfs([Link], argv[1]);

free([Link]);
return 0;
}

You might also like