Python Script for Book Error Detection

This Python script crawls textbook chapters on a website to check for errors, summarizing the results in an HTML file. It uses functions like get_details() to extract chapter names and links, get_chapter_errors() to check pages for output errors, and error_log_to_html() to convert the error log dictionary into HTML format. The main() function calls these functions to crawl a given textbook URL, check chapters for errors, record the results, and output them as an HTML page.

Uploaded by

Thirumalesh Hadapada Sreenivasa

We take content rights seriously. If you suspect this is your content, claim it here.

Available Formats

Download as PDF, TXT, or read online on Scribd

0% found this document useful (0 votes)

68 views · 3 pages

Python Script for Book Error Detection

Uploaded by

Thirumalesh Hadapada Sreenivasa

We take content rights seriously. If you suspect this is your content, claim it here.

Available Formats

Download as PDF, TXT, or read online on Scribd

You are on page 1/ 3

# Thirumalesh H S

import urllib2
import sys
import os
import webbrowser
from bs4 import BeautifulSoup

def dummy1():
    """Return the constant 1.

    Placeholder function; no caller is visible in this file.
    """
    return 1

def get_details(link, index, base_url='http://tbc-python.fossee.in'):
    """ Crawls through the given link for book or chapter details

    Parameters
    --------------
    link : string
        link of the book(.html) or chapter(.html or .ipynb)
    index : int
        Column of the first table on the page to read the name/link from
            * index = 1 --> gets details of the book
            * index = 0 --> gets details of the chapter
    base_url : string, optional
        Prefix put in front of every href found in the table (the hrefs
        are treated as site-relative paths).

    Returns
    --------------
    details_list : list
        list of [name, link] lists for the book or chapter; empty when
        the page contains no table. Rows that have no cell at ``index``
        (e.g. header rows), or whose cell holds no usable anchor, are
        skipped.
    """
    details_list = []

    response = urllib2.urlopen(link)
    try:
        src = BeautifulSoup(response)
    finally:
        # Release the connection even if parsing fails.
        response.close()

    table = src.find('table')
    if table is None:
        return details_list

    for row in table.find_all('tr'):
        columns = row.find_all('td')
        try:
            anchor = columns[index].a
        except IndexError:
            # Row has no <td> at this position (e.g. a <th> header row).
            continue
        if anchor is None or anchor.string is None:
            continue
        href = anchor.get("href")
        if href is None:
            continue

        name = anchor.string.encode('ascii', 'ignore')
        details_list.append([name, '%s%s' % (base_url, href)])

    return details_list

def get_chapter_errors(chap_link):
    """ Crawls through the chapter and finds the number of errors

    Parameters
    --------------
    chap_link : string
        .ipynb/html link of the chapter

    Returns
    --------------
    error : int or NoneType or string
        * int      --> number of error output blocks found in the page
        * None     --> the page was fetched and contains no error blocks
        * string   --> text of the fetch error if the link could not be
                       opened (broken link, unreachable host, ...)
    """
    try:
        response = urllib2.urlopen(chap_link)
    except urllib2.URLError as e:
        # HTTPError is a subclass of URLError, so this also covers HTTP
        # status failures; catching the base class keeps one unreachable
        # page from aborting the whole crawl.
        return str(e)

    try:
        chp_src = BeautifulSoup(response)
    finally:
        response.close()

    example_errors = chp_src.find_all(
        'div', {'class': 'output_subarea output_text output_error'})

    # Callers distinguish "no errors" by None, so never return 0.
    return len(example_errors) or None

def error_log_to_html(error_log):
    """ Uses jinja2 template to convert python dictionary into html format.
    Data from error_log is converted into formatted html tags, which can be
    viewed as a webpage.

    Parameters
    --------------
    error_log: dictionary
        maps a book key to a dictionary with the entries
            * 'name'     --> book name
            * 'url'      --> book link
            * 'chapters' --> list of dictionaries with 'name', 'errors'
                             and 'chapter_url' for each failing chapter
        The keys of error_log itself are not rendered.

    Returns
    --------------
    html: html data
        error_log in html format

    """
    from jinja2 import Template

    # href values are quoted and autoescaping is enabled below because
    # names and links come from crawled pages and may contain characters
    # that would otherwise break the markup.
    temp = """
<html>
<head>
<title>Errors in the uploaded python codes in TBC website!!!</title>
</head>
<body>
<center>
<h2> Python TBC errors </h2>
</center>
{% for value in error_log.values() %}
<a href="{{ value.url }}">{{ value.name }}</a>
{% if value.chapters %}
<table>
{% for c in value.chapters %}
<tr>
<td>Errors: {{ c.errors }} <a href="{{ c.chapter_url }}">{{ c.name }}</a> </td>
</tr>
{% endfor %}
</table>
{% endif %}
<br/>
{% endfor %}
</body>
</html>
"""

    template = Template(temp, autoescape=True)
    html = template.render(error_log=error_log)

    return html

def _write_report(error_log, path):
    """Render error_log as HTML and overwrite the file at path with it."""
    html = error_log_to_html(error_log)
    with open(path, "w") as html_file:
        html_file.write(html)


def main():
    """ Crawls every book listed at the url given on the command line,
    checks each chapter for errors and writes the summary to
    tbc-errors.html in the current directory.

    The report is rewritten after each book that has failing chapters, so
    partial results survive an interrupted run. It is written once more
    at the end (even when nothing failed) and only then opened in the web
    browser, so the browser never points at a missing or stale file.

    Exits with a usage message when no url argument is supplied.
    """
    report_path = "tbc-errors.html"

    if len(sys.argv) < 2:
        sys.exit('usage: %s <url>' % sys.argv[0])
    url = sys.argv[1]  # accept url as argument

    error_log = {}

    # Get details of the book
    for book_name, book_link in get_details(url, index=1):
        chapters = []

        # Get details of the chapter
        for chap_name, chap_link in get_details(book_link, index=0):
            error = get_chapter_errors(chap_link)

            if error is None:
                print('No errors:  %s %s' % (book_name, chap_name))
                continue

            # If error is present in the chapter, store details in error_log
            chapters.append({'name': chap_name,
                             'errors': error,
                             'chapter_url': chap_link,
                             })
            print('Error:  %s %s' % (book_name, chap_name))

        if chapters:
            # Keyed by the full link: str.strip() with a URL argument
            # removes a *set of characters* from both ends, not a prefix,
            # so it cannot be relied on to produce a unique id.
            error_log[book_link] = {'name': book_name,
                                    'url': book_link,
                                    'chapters': chapters,
                                    }
            _write_report(error_log, report_path)

    _write_report(error_log, report_path)
    webbrowser.open('file://' + os.path.realpath(report_path), new=0)

# Run the crawl only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

Web Mining Techniques and Code
No ratings yet
Web Mining Techniques and Code
11 pages
Sans Titre
No ratings yet
Sans Titre
11 pages
Web Crawler with Robots.txt Handling
No ratings yet
Web Crawler with Robots.txt Handling
3 pages
Unit I
No ratings yet
Unit I
12 pages
03 Web Scraping
No ratings yet
03 Web Scraping
41 pages
UI Ex 6 (61) - 1
No ratings yet
UI Ex 6 (61) - 1
3 pages
Lecture 12 - Web Scrapping
No ratings yet
Lecture 12 - Web Scrapping
11 pages
Python v3 URL and Page
No ratings yet
Python v3 URL and Page
4 pages
Python File Handling and Data Processing
No ratings yet
Python File Handling and Data Processing
11 pages
Data Gathering Script Guide
No ratings yet
Data Gathering Script Guide
7 pages
Message 12 3
No ratings yet
Message 12 3
10 pages
Social Media Username Checker
100% (1)
Social Media Username Checker
21 pages
Web Scraping
No ratings yet
Web Scraping
11 pages
Python Web Mining Crawler & Encoding Techniques
No ratings yet
Python Web Mining Crawler & Encoding Techniques
27 pages
Message
No ratings yet
Message
3 pages
Message
No ratings yet
Message
3 pages
Web Mining Lab Source Code 1-12 PRINT
No ratings yet
Web Mining Lab Source Code 1-12 PRINT
43 pages
Tool
No ratings yet
Tool
3 pages
Practical Introduction To Web Scraping in Python
100% (1)
Practical Introduction To Web Scraping in Python
14 pages
Python Pentesting Techniques Guide
No ratings yet
Python Pentesting Techniques Guide
1 page
New Text Document
No ratings yet
New Text Document
3 pages
Error
No ratings yet
Error
11 pages
Sfcli 33
No ratings yet
Sfcli 33
26 pages
Subdomain Scanner Script
No ratings yet
Subdomain Scanner Script
2 pages
Logging & Debugging Python Tricks
No ratings yet
Logging & Debugging Python Tricks
22 pages
Web Mining Practical File (NS)
No ratings yet
Web Mining Practical File (NS)
15 pages
CSS Exp 08
No ratings yet
CSS Exp 08
4 pages
Mgeko - Lua 3
No ratings yet
Mgeko - Lua 3
3 pages
Beautiful Soup 4 Documentation Guide
100% (1)
Beautiful Soup 4 Documentation Guide
56 pages
Python AI Prompts and Code Examples
No ratings yet
Python AI Prompts and Code Examples
9 pages
Trip Planning Tool with AI Agents
No ratings yet
Trip Planning Tool with AI Agents
7 pages
Parser
No ratings yet
Parser
6 pages
Python String and Link Extraction Guide
No ratings yet
Python String and Link Extraction Guide
3 pages
Web Scraper-Document
No ratings yet
Web Scraper-Document
2 pages
Beautiful Soup
No ratings yet
Beautiful Soup
61 pages
Web Scraping with Python and urllib
100% (1)
Web Scraping with Python and urllib
57 pages
Programming Assignment Unit 07 - CS 3308 - Information Retrieval - University of The People
No ratings yet
Programming Assignment Unit 07 - CS 3308 - Information Retrieval - University of The People
4 pages
.txt
No ratings yet
.txt
2 pages
Gitlab Agent
No ratings yet
Gitlab Agent
7 pages
Ballerono Cappuchino
No ratings yet
Ballerono Cappuchino
10 pages
AIML Manual Lab-For Students
No ratings yet
AIML Manual Lab-For Students
45 pages
卂几ㄖ几ㄚ
No ratings yet
卂几ㄖ几ㄚ
8 pages
Beautiful Soup Documentation - Beautiful Soup 4.13.0 Documentation
No ratings yet
Beautiful Soup Documentation - Beautiful Soup 4.13.0 Documentation
54 pages
Domain SQLi Finder - Py
No ratings yet
Domain SQLi Finder - Py
13 pages
Hindu Customized Recipe
No ratings yet
Hindu Customized Recipe
2 pages
Web Scraping with Beautiful Soup Guide
No ratings yet
Web Scraping with Beautiful Soup Guide
11 pages
Another Hack Test14
No ratings yet
Another Hack Test14
2 pages
Grey Meta Final Safe
No ratings yet
Grey Meta Final Safe
11 pages
Assignment 1
No ratings yet
Assignment 1
5 pages
Beautiful Soup: Python HTML/XML Parsing
No ratings yet
Beautiful Soup: Python HTML/XML Parsing
40 pages
KM (649-1 5)
No ratings yet
KM (649-1 5)
2 pages
Git Filter Repo
No ratings yet
Git Filter Repo
69 pages
PDF 24
No ratings yet
PDF 24
7 pages
Building Knowledge Graphs From ML Case Studies
No ratings yet
Building Knowledge Graphs From ML Case Studies
2 pages
Web Data Analytics: Instructions
No ratings yet
Web Data Analytics: Instructions
2 pages
Beautiful Soup 4 Documentation Guide
No ratings yet
Beautiful Soup 4 Documentation Guide
61 pages
Web Scraping & Graphs for Developers
No ratings yet
Web Scraping & Graphs for Developers
26 pages
Furqan Hassan
No ratings yet
Furqan Hassan
1 page
8 Series Chipset PCH Datasheet
No ratings yet
8 Series Chipset PCH Datasheet
992 pages
Android System Shutdown Logs
No ratings yet
Android System Shutdown Logs
5 pages
Online Class 15-Infix, Prefins, Postfix
100% (1)
Online Class 15-Infix, Prefins, Postfix
62 pages
1Sec-Computer (En) - Revision For Octobar Test
No ratings yet
1Sec-Computer (En) - Revision For Octobar Test
2 pages
Veeam Backup 9 5 Evaluators Guide Vsphere en
No ratings yet
Veeam Backup 9 5 Evaluators Guide Vsphere en
130 pages
B Tech Python
No ratings yet
B Tech Python
7 pages
Digital System Design Jan 2018 (2015 Scheme)
No ratings yet
Digital System Design Jan 2018 (2015 Scheme)
2 pages
Computer Architecture Lab4
No ratings yet
Computer Architecture Lab4
4 pages
MCQ in C
No ratings yet
MCQ in C
47 pages
Wireless USB Technology Overview
No ratings yet
Wireless USB Technology Overview
12 pages
Cross Pad
No ratings yet
Cross Pad
35 pages
IBM TS7700 Market Competitive Compare Seller Presentation L3 - 2023-Dec-06
No ratings yet
IBM TS7700 Market Competitive Compare Seller Presentation L3 - 2023-Dec-06
18 pages
ReleaseNote - FileList of GA502IU - 19H2 - 64 - V3.02
No ratings yet
ReleaseNote - FileList of GA502IU - 19H2 - 64 - V3.02
5 pages
Smart Linx PDF
No ratings yet
Smart Linx PDF
47 pages
IT Disaster Recovery Plan - Ready - Gov
No ratings yet
IT Disaster Recovery Plan - Ready - Gov
7 pages
Institutional Assessment - Common Competency - Web Development
No ratings yet
Institutional Assessment - Common Competency - Web Development
8 pages
REV0.0.3 CYQGWW - GS CarbonV3 Pinout
No ratings yet
REV0.0.3 CYQGWW - GS CarbonV3 Pinout
1 page
SRDF Interfamily Connectivity Information
No ratings yet
SRDF Interfamily Connectivity Information
15 pages
مقسم 20230102 2356
No ratings yet
مقسم 20230102 2356
35 pages
NASEM Dairy-8 FAQ
No ratings yet
NASEM Dairy-8 FAQ
6 pages
Kubernetes Deploy Mysql Spring Rest Api React Native App Instructions
No ratings yet
Kubernetes Deploy Mysql Spring Rest Api React Native App Instructions
7 pages
HPE iLO & AWS Kinesis Setup Guide
No ratings yet
HPE iLO & AWS Kinesis Setup Guide
6 pages
Types of Debuggers
No ratings yet
Types of Debuggers
14 pages
NORDCON User Manual Overview
No ratings yet
NORDCON User Manual Overview
240 pages
Pe 1998 11
No ratings yet
Pe 1998 11
92 pages
Cse211 Operating-Systems TH 2.00 Sc04
No ratings yet
Cse211 Operating-Systems TH 2.00 Sc04
1 page
ATA/ATAPI-7 Amendment Details
No ratings yet
ATA/ATAPI-7 Amendment Details
4 pages
HITB AMS 2017 Blue Picking - Hacking Bluetooth Smart Locks PDF
100% (1)
HITB AMS 2017 Blue Picking - Hacking Bluetooth Smart Locks PDF
228 pages
Jupyter Notebook For Beginners
100% (2)
Jupyter Notebook For Beginners
23 pages