0% found this document useful (0 votes)
8 views2 pages

PDFScraper

The document is a C program that downloads PDF files from a specified URL. It uses the libcurl library to fetch HTML content and the libxml library to parse the HTML for links to PDF files. The program extracts these links, constructs the full URLs, and downloads the files to the local system.

Uploaded by

pedronunezwork
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT or PDF, or read online on Scribd
0% found this document useful (0 votes)
8 views2 pages

PDFScraper

The document is a C program that downloads PDF files from a specified URL. It uses the libcurl library to fetch HTML content and the libxml library to parse the HTML for links to PDF files. The program extracts these links, constructs the full URLs, and downloads the files to the local system.

Uploaded by

pedronunezwork
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT or PDF, or read online on Scribd

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <curl/curl.h>
#include <libxml/HTMLparser.h>

/* Growable byte buffer used to accumulate an HTTP response body in memory. */
struct MemoryStruct {
    char *memory;   /* heap-allocated buffer, kept NUL-terminated */
    size_t size;    /* bytes stored, excluding the terminating NUL */
};

/*
 * libcurl write callback: appends size * nmemb bytes from `contents` to the
 * MemoryStruct pointed to by `userp`, keeping the buffer NUL-terminated so it
 * can be used as a C string.
 *
 * Returns the number of bytes consumed, or 0 on allocation failure (which
 * makes libcurl abort the transfer).
 */
static size_t WriteMemoryCallback(void *contents, size_t size, size_t nmemb, void *userp) {
    size_t realsize = size * nmemb;
    struct MemoryStruct *mem = (struct MemoryStruct *)userp;

    /* Grow via a temporary: assigning realloc's result directly to
     * mem->memory would leak the original buffer if realloc failed. */
    char *grown = realloc(mem->memory, mem->size + realsize + 1);
    if (grown == NULL) return 0;   /* signal libcurl to abort the transfer */
    mem->memory = grown;

    memcpy(&(mem->memory[mem->size]), contents, realsize);
    mem->size += realsize;
    mem->memory[mem->size] = 0;    /* re-terminate after every append */
    return realsize;
}

void download_file(const char *url, const char *filename) {


FILE *fp = fopen(filename, "wb");
if (!fp) return;
CURL *curl = curl_easy_init();
if (!curl) return;
curl_easy_setopt(curl, CURLOPT_URL, url);
curl_easy_setopt(curl, CURLOPT_WRITEDATA, fp);
curl_easy_perform(curl);
curl_easy_cleanup(curl);
fclose(fp);
}

/*
 * Recursive helper: visits `node` and all of its siblings and descendants
 * in document order, downloading every <a href> whose href contains ".pdf".
 * Relative hrefs are resolved naively against base_url by concatenation.
 */
static void scan_for_pdf_links(xmlNode *node, const char *base_url) {
    for (xmlNode *cur = node; cur; cur = cur->next) {
        if (cur->type == XML_ELEMENT_NODE &&
            strcmp((const char *)cur->name, "a") == 0) {
            xmlChar *href = xmlGetProp(cur, (const xmlChar *)"href");
            if (href) {
                if (strstr((const char *)href, ".pdf")) {
                    char full_url[1024];
                    /* Absolute URLs are used verbatim; anything else is
                     * treated as relative to base_url. */
                    if (strncmp((const char *)href, "http", 4) == 0) {
                        snprintf(full_url, sizeof(full_url), "%s",
                                 (const char *)href);
                    } else {
                        snprintf(full_url, sizeof(full_url), "%s/%s",
                                 base_url, (const char *)href);
                    }
                    /* Use the last path component as the local filename. */
                    const char *filename = strrchr(full_url, '/');
                    if (filename && *(filename + 1)) {
                        download_file(full_url, filename + 1);
                    }
                }
                /* Free unconditionally: the original leaked href whenever
                 * the link was not a PDF. */
                xmlFree(href);
            }
        }
        scan_for_pdf_links(cur->children, base_url);
    }
}

/*
 * Parses `html` (fetched from base_url) and downloads every linked PDF
 * into the current working directory.
 */
void find_and_download_pdfs(const char *html, const char *base_url) {
    htmlDocPtr doc = htmlReadMemory(html, strlen(html), base_url, NULL,
                                    HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);
    if (!doc) return;

    /* The original iterated with xmlNextNode(), which does not exist in
     * libxml2; walk the tree explicitly instead. */
    scan_for_pdf_links(xmlDocGetRootElement(doc), base_url);

    xmlFreeDoc(doc);
    /* NOTE: xmlCleanupParser() (called here in the original) tears down
     * libxml2 global state and must only run once at program exit, so it
     * is intentionally omitted from this per-call path. */
}

int main(int argc, char **argv) {


if (argc != 2) {
printf("Usage: %s <url>\n", argv[0]);
return 1;
}

CURL *curl = curl_easy_init();


if (!curl) return 1;

struct MemoryStruct chunk = { malloc(1), 0 };


curl_easy_setopt(curl, CURLOPT_URL, argv[1]);
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteMemoryCallback);
curl_easy_setopt(curl, CURLOPT_WRITEDATA, (void *)&chunk);
curl_easy_perform(curl);
curl_easy_cleanup(curl);

find_and_download_pdfs([Link], argv[1]);

free([Link]);
return 0;
}

You might also like