# Complete Manga/Comic Text Removal Project
# Advanced solution with multiple detection methods and inpainting techniques
# Fully compatible with Google Colab
import gc
import json
import os
import shlex
import subprocess
import sys
import time
import warnings
import zipfile
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import cv2
import gdown
import matplotlib.pyplot as plt
import numpy as np
import requests
import torch
import torchvision.transforms as transforms
from PIL import Image, ImageDraw, ImageEnhance, ImageFilter
from tqdm import tqdm

warnings.filterwarnings('ignore')
# ======================= INSTALLATION SETUP =======================
def install_all_dependencies():
    """Install every third-party package this project uses (Google Colab).

    Each requirement is installed with its own ``pip`` invocation so that a
    single failure does not abort the rest. Failures are reported and
    summarised at the end instead of being silently ignored.

    Returns:
        None. Progress and failures are printed.
    """
    print("Installing all required packages... This may take a few minutes.")
    has_cuda = torch.cuda.is_available()

    # Core packages
    packages = [
        # NOTE(review): the original index URL was lost when this file was
        # extracted (it appears as "[Link]"); restore the real PyTorch wheel
        # index URL before running.
        "torch torchvision torchaudio --index-url [Link]",
        "opencv-python-headless",
        "pillow>=9.0.0",
        "numpy>=1.21.0",
        "matplotlib>=3.5.0",
        "tqdm",
        "scipy",
        "scikit-image",
        "scikit-learn",
    ]
    # OCR packages
    ocr_packages = [
        "easyocr",
        "paddlepaddle-gpu" if has_cuda else "paddlepaddle",
        "paddleocr>=2.6.0",
    ]
    # AI/ML packages
    ai_packages = [
        "transformers>=4.20.0",
        "diffusers>=0.21.0",
        "accelerate>=0.20.0",
        "controlnet-aux",
        "xformers" if has_cuda else "",
        "segment-anything",
        "ultralytics>=8.0.0",
    ]
    # Additional utilities
    util_packages = [
        "imageio",
        "imageio-ffmpeg",
        "gradio",
        "ipywidgets",
    ]

    failed = []
    for package in packages + ocr_packages + ai_packages + util_packages:
        if not package:  # Skip entries disabled by the CUDA check
            continue
        print(f"Installing {package}...")
        # Pass an argument list (no shell): with a shell string, the ">" in a
        # specifier such as "pillow>=9.0.0" is treated as output redirection,
        # which drops the version constraint and creates a stray "=9.0.0" file.
        command = [sys.executable, "-m", "pip", "install", "-q",
                   *shlex.split(package)]
        try:
            subprocess.run(command, check=True)
        except (subprocess.CalledProcessError, OSError) as e:
            print(f"Warning: Could not install {package}: {e}")
            failed.append(package)

    # Additional setup for specific packages (best effort: NLTK is optional
    # and is not part of the install list above).
    try:
        import nltk
    except ImportError:
        pass
    else:
        try:
            nltk.download('punkt', quiet=True)
        except Exception as e:
            print(f"Warning: NLTK data download failed: {e}")

    if failed:
        print(f"⚠️ Finished with {len(failed)} failed package(s): {failed}")
    else:
        print("✅ All dependencies installed successfully!")
# ======================= ADVANCED TEXT DETECTION =======================
class AdvancedTextDetector:
    """Multi-method text detection with manga/comic specialization.

    Detector backends are stored in ``self.readers`` keyed by name
    ('easyocr', 'paddle_en', 'paddle_ch', 'craft', 'east'). Every detection
    is a dict with the keys ``bbox`` (x1, y1, x2, y2), ``confidence``,
    ``method``, ``text`` and ``polygon``.
    """

    # EasyOCR rejects unknown language codes (Chinese is 'ch_sim'/'ch_tra',
    # not 'zh') and only allows Japanese, Korean, Chinese and Thai models to
    # be paired with English, so one Reader cannot cover all of them.
    # Override this attribute to target a different language pair.
    EASYOCR_LANGUAGES = ('ja', 'en')

    # Detections from different methods are merged above this IoU.
    MERGE_IOU_THRESHOLD = 0.3

    def __init__(self):
        """Create the detection cache and load every available backend."""
        self.detection_cache = {}
        self.setup_all_detectors()

    # ------------------------------------------------------------------
    # Setup
    # ------------------------------------------------------------------
    def setup_all_detectors(self):
        """Initialize all available text detection methods.

        Each backend is optional: a failure is printed and the backend is
        simply left out of ``self.readers``.
        """
        print("🔧 Setting up text detection models...")
        # OCR readers / detector networks, keyed by method name
        self.readers = {}
        use_gpu = torch.cuda.is_available()

        # EasyOCR setup
        try:
            import easyocr
            self.readers['easyocr'] = easyocr.Reader(
                list(self.EASYOCR_LANGUAGES),
                gpu=use_gpu
            )
            print("✅ EasyOCR initialized")
        except Exception as e:
            print(f"⚠️ EasyOCR failed: {e}")

        # PaddleOCR setup
        try:
            from paddleocr import PaddleOCR
            self.readers['paddle_en'] = PaddleOCR(
                use_angle_cls=True,
                lang='en',
                show_log=False,
                use_gpu=use_gpu
            )
            self.readers['paddle_ch'] = PaddleOCR(
                use_angle_cls=True,
                lang='ch',
                show_log=False,
                use_gpu=use_gpu
            )
            print("✅ PaddleOCR initialized")
        except Exception as e:
            print(f"⚠️ PaddleOCR failed: {e}")

        # CRAFT Text Detection (if available)
        try:
            self.setup_craft_detector()
        except Exception:
            print("⚠️ CRAFT detector not available")

        # OpenCV-based detectors
        self.setup_opencv_detectors()
        print(f"✅ Text detection setup complete! Available methods: "
              f"{list(self.readers.keys())}")

    def setup_craft_detector(self):
        """Download the CRAFT weights and register a placeholder entry.

        No CRAFT inference is implemented: ``self.readers['craft']`` is set
        to ``None`` and is never used by ``detect_text_comprehensive``.
        """
        try:
            # Download CRAFT model if not exists
            craft_path = "/content/craft_mlt_25k.pth"
            if not os.path.exists(craft_path):
                print("Downloading CRAFT model...")
                # NOTE(review): the download URL was lost when this file was
                # extracted (it appears as "[Link]"); restore it before use.
                url = "[Link]"
                os.system(f"wget -q {url} -O {craft_path}")
            # Note: Full CRAFT implementation would go here
            # For now, we'll use a placeholder
            self.readers['craft'] = None
        except Exception as e:
            print(f"CRAFT setup failed: {e}")

    def setup_opencv_detectors(self):
        """Download (if needed) and load the EAST text detection network."""
        try:
            east_path = "/content/frozen_east_text_detection.pb"
            if not os.path.exists(east_path):
                print("Downloading EAST model...")
                # NOTE(review): the download URL was lost when this file was
                # extracted (it appears as "[Link]"); restore it before use.
                url = "[Link]"
                os.system(f"wget -q {url} -O {east_path}")
            self.readers['east'] = cv2.dnn.readNet(east_path)
            print("✅ EAST detector initialized")
        except Exception as e:
            print(f"⚠️ EAST detector failed: {e}")

    # ------------------------------------------------------------------
    # Detection helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _rect_detection(x1, y1, x2, y2, confidence, method, text=''):
        """Build a detection dict for an axis-aligned rectangle."""
        return {
            'bbox': (x1, y1, x2, y2),
            'confidence': confidence,
            'method': method,
            'text': text,
            'polygon': [(x1, y1), (x2, y1), (x2, y2), (x1, y2)],
        }

    @staticmethod
    def _polygon_detection(polygon, text, confidence, method):
        """Build a detection dict from an OCR polygon (sequence of points).

        The bbox is the polygon's axis-aligned extent, truncated to plain
        Python ints; the polygon itself is stored unchanged.
        """
        points = np.array(polygon, dtype=np.int32)
        x_min, y_min = np.min(points, axis=0)
        x_max, y_max = np.max(points, axis=0)
        return {
            'bbox': (int(x_min), int(y_min), int(x_max), int(y_max)),
            'confidence': confidence,
            'method': method,
            'text': text,
            'polygon': polygon,
        }

    # ------------------------------------------------------------------
    # Public entry point
    # ------------------------------------------------------------------
    def detect_text_comprehensive(self, image: np.ndarray,
                                  min_confidence: float = 0.3) -> List[Dict]:
        """Run every available detector on ``image`` and merge the results.

        Args:
            image: Image array; a 3-dimensional array is converted with
                ``cv2.COLOR_BGR2GRAY`` where grayscale is needed.
            min_confidence: Detections scoring below this are discarded.

        Returns:
            List of detection dictionaries with bbox, confidence, method,
            text and polygon, after overlapping detections were merged.
        """
        results = []
        # Method 1: EasyOCR
        if 'easyocr' in self.readers:
            results.extend(self._detect_with_easyocr(image, min_confidence))
        # Method 2: PaddleOCR
        if 'paddle_en' in self.readers:
            results.extend(self._detect_with_paddle(image, min_confidence))
        # Method 3: EAST
        if 'east' in self.readers:
            results.extend(self._detect_with_east(image, min_confidence))
        # Method 4: OpenCV methods
        results.extend(self._detect_with_opencv(image, min_confidence))
        # Method 5: Manga-specific detection
        results.extend(self._detect_manga_specific(image, min_confidence))
        # Merge and filter results
        return self._merge_detections(results)

    # ------------------------------------------------------------------
    # OCR-based detectors
    # ------------------------------------------------------------------
    def _detect_with_easyocr(self, image: np.ndarray,
                             min_confidence: float) -> List[Dict]:
        """Detect text with EasyOCR; returns [] if the reader fails."""
        results = []
        try:
            detections = self.readers['easyocr'].readtext(image)
            for bbox, text, confidence in detections:
                if confidence >= min_confidence:
                    results.append(self._polygon_detection(
                        bbox, text, confidence, 'easyocr'))
        except Exception as e:
            print(f"EasyOCR detection error: {e}")
        return results

    def _detect_with_paddle(self, image: np.ndarray,
                            min_confidence: float) -> List[Dict]:
        """Detect text with each loaded PaddleOCR reader (en, then ch)."""
        results = []
        for lang in ['paddle_en', 'paddle_ch']:
            if lang not in self.readers:
                continue
            try:
                ocr_results = self.readers[lang].ocr(image, cls=True)
                if ocr_results and ocr_results[0]:
                    for item in ocr_results[0]:
                        bbox, (text, confidence) = item
                        if confidence >= min_confidence:
                            results.append(self._polygon_detection(
                                bbox, text, confidence, lang))
            except Exception as e:
                print(f"{lang} detection error: {e}")
        return results

    # ------------------------------------------------------------------
    # EAST detector
    # ------------------------------------------------------------------
    def _detect_with_east(self, image: np.ndarray,
                          min_confidence: float) -> List[Dict]:
        """Detect text boxes with the EAST network on a 320x320 resize."""
        results = []
        try:
            if 'east' not in self.readers:
                return results
            net = self.readers['east']
            height, width = image.shape[:2]
            # Prepare image for EAST
            new_height, new_width = 320, 320
            ratio_h, ratio_w = height / new_height, width / new_width
            blob = cv2.dnn.blobFromImage(image, 1.0, (new_width, new_height),
                                         (123.68, 116.78, 103.94),
                                         swapRB=True, crop=False)
            net.setInput(blob)
            scores, geometry = net.forward(['feature_fusion/Conv_7/Sigmoid',
                                            'feature_fusion/concat_3'])
            # Decode predictions
            boxes, confidences = self._decode_east_predictions(
                scores, geometry, min_confidence)
            # Apply NMS
            indices = cv2.dnn.NMSBoxes(boxes, confidences, min_confidence, 0.4)
            if len(indices) > 0:
                for i in np.asarray(indices).flatten():
                    x, y, w, h = boxes[i]
                    # Scale back to original image
                    x = int(x * ratio_w)
                    y = int(y * ratio_h)
                    w = int(w * ratio_w)
                    h = int(h * ratio_h)
                    results.append(self._rect_detection(
                        x, y, x + w, y + h, confidences[i], 'east'))
        except Exception as e:
            print(f"EAST detection error: {e}")
        return results

    def _decode_east_predictions(self, scores, geometry, min_confidence):
        """Decode EAST score/geometry maps into boxes and confidences.

        Args:
            scores: Array indexed as ``scores[0, 0, y, x]``.
            geometry: Array indexed as ``geometry[0, c, y, x]``; channels
                0-3 are combined into box height/width and channel 4 is
                used as the rotation angle.
            min_confidence: Cells scoring below this are skipped.

        Returns:
            ``(boxes, confidences)`` where each box is ``[x, y, w, h]`` in
            Python ints and confidences are Python floats.
        """
        boxes = []
        confidences = []
        height, width = scores.shape[2:4]
        for y in range(height):
            scores_data = scores[0, 0, y]
            x_data0 = geometry[0, 0, y]
            x_data1 = geometry[0, 1, y]
            x_data2 = geometry[0, 2, y]
            x_data3 = geometry[0, 3, y]
            angles_data = geometry[0, 4, y]
            for x in range(width):
                if scores_data[x] < min_confidence:
                    continue
                # Map cells are scaled by 4 to get network-input pixels
                offset_x, offset_y = x * 4.0, y * 4.0
                angle = angles_data[x]
                cos = np.cos(angle)
                sin = np.sin(angle)
                h = x_data0[x] + x_data2[x]
                w = x_data1[x] + x_data3[x]
                end_x = int(offset_x + (cos * x_data1[x]) + (sin * x_data2[x]))
                end_y = int(offset_y - (sin * x_data1[x]) + (cos * x_data2[x]))
                start_x = int(end_x - w)
                start_y = int(end_y - h)
                boxes.append([start_x, start_y, int(w), int(h)])
                confidences.append(float(scores_data[x]))
        return boxes, confidences

    # ------------------------------------------------------------------
    # Classical OpenCV detectors
    # ------------------------------------------------------------------
    def _detect_with_opencv(self, image: np.ndarray,
                            min_confidence: float) -> List[Dict]:
        """Detect text-like regions with MSER and thresholded contours.

        These heuristics assign fixed confidences (0.6 for MSER, 0.5 for
        contours); results below ``min_confidence`` are dropped.
        """
        results = []
        try:
            gray = (cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
                    if len(image.shape) == 3 else image)

            # Method 1: MSER (Maximally Stable Extremal Regions).
            # Arguments are positional because the keyword names differ
            # between OpenCV releases (`_delta` vs `delta`, ...). Order:
            # delta, min_area, max_area, max_variation, min_diversity,
            # max_evolution, area_threshold, min_margin, edge_blur_size.
            mser = cv2.MSER_create(2, 30, 8000, 0.25, 0.2, 200, 1.01, 0.003, 5)
            regions, _ = mser.detectRegions(gray)
            for region in regions:
                if len(region) <= 10:
                    continue
                x, y, w, h = cv2.boundingRect(region)
                aspect_ratio = w / h if h > 0 else 0
                area = w * h
                if (0.1 < aspect_ratio < 20 and 100 < area < 10000 and
                        w > 15 and h > 8):
                    results.append(self._rect_detection(
                        x, y, x + w, y + h, 0.6, 'mser'))

            # Method 2: Contour-based detection
            # Apply multiple preprocessing techniques
            preprocessed = [
                cv2.adaptiveThreshold(gray, 255,
                                      cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                      cv2.THRESH_BINARY_INV, 11, 2),
                cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_MEAN_C,
                                      cv2.THRESH_BINARY_INV, 15, 4),
                cv2.threshold(gray, 0, 255,
                              cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1],
            ]
            # Morphological closing kernel (same for every threshold image)
            kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
            for thresh in preprocessed:
                processed = cv2.morphologyEx(thresh, cv2.MORPH_CLOSE, kernel)
                contours, _ = cv2.findContours(processed, cv2.RETR_EXTERNAL,
                                               cv2.CHAIN_APPROX_SIMPLE)
                for contour in contours:
                    area = cv2.contourArea(contour)
                    if not 50 < area < 5000:
                        continue
                    x, y, w, h = cv2.boundingRect(contour)
                    aspect_ratio = w / h if h > 0 else 0
                    if 0.2 < aspect_ratio < 15 and w > 10 and h > 8:
                        results.append(self._rect_detection(
                            x, y, x + w, y + h, 0.5, 'contour'))
        except Exception as e:
            print(f"OpenCV detection error: {e}")
        # Honour the caller's threshold like the OCR-based detectors do
        return [r for r in results if r['confidence'] >= min_confidence]

    # ------------------------------------------------------------------
    # Manga-specific detectors
    # ------------------------------------------------------------------
    def _detect_manga_specific(self, image: np.ndarray,
                               min_confidence: float) -> List[Dict]:
        """Run the speech-bubble, sound-effect and handwriting heuristics.

        Results below ``min_confidence`` are dropped.
        """
        results = []
        try:
            gray = (cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
                    if len(image.shape) == 3 else image)
            # Speech bubble detection
            results.extend(self._detect_speech_bubbles(gray))
            # Sound effect detection (often has different characteristics)
            results.extend(self._detect_sound_effects(gray))
            # Handwritten text detection
            results.extend(self._detect_handwritten_text(gray))
        except Exception as e:
            print(f"Manga-specific detection error: {e}")
        return [r for r in results if r['confidence'] >= min_confidence]

    def _detect_speech_bubbles(self, gray: np.ndarray) -> List[Dict]:
        """Detect circular (Hough) and roughly polygonal speech bubbles."""
        results = []
        try:
            # Use HoughCircles to detect circular/oval speech bubbles
            circles = cv2.HoughCircles(gray, cv2.HOUGH_GRADIENT, 1, 50,
                                       param1=50, param2=30, minRadius=20,
                                       maxRadius=200)
            if circles is not None:
                circles = np.round(circles[0, :]).astype("int")
                for (x, y, r) in circles:
                    # Bounding box around the circle, clipped to the image
                    results.append(self._rect_detection(
                        max(0, x - r), max(0, y - r),
                        min(gray.shape[1], x + r), min(gray.shape[0], y + r),
                        0.4, 'speech_bubble'))

            # Detect rectangular speech bubbles
            # Apply edge detection
            edges = cv2.Canny(gray, 50, 150, apertureSize=3)
            kernel = np.ones((3, 3), np.uint8)
            edges = cv2.dilate(edges, kernel, iterations=1)
            contours, _ = cv2.findContours(edges, cv2.RETR_EXTERNAL,
                                           cv2.CHAIN_APPROX_SIMPLE)
            for contour in contours:
                area = cv2.contourArea(contour)
                if not 500 < area < 20000:  # Size filter for speech bubbles
                    continue
                # Approximate contour
                epsilon = 0.02 * cv2.arcLength(contour, True)
                approx = cv2.approxPolyDP(contour, epsilon, True)
                if len(approx) < 4:  # Need at least four vertices
                    continue
                x, y, w, h = cv2.boundingRect(contour)
                aspect_ratio = w / h if h > 0 else 0
                if 0.3 < aspect_ratio < 5:  # Reasonable aspect ratio
                    results.append(self._rect_detection(
                        x, y, x + w, y + h, 0.5, 'rect_bubble'))
        except Exception as e:
            print(f"Speech bubble detection error: {e}")
        return results

    def _detect_sound_effects(self, gray: np.ndarray) -> List[Dict]:
        """Detect sound effects text (often stylized)."""
        results = []
        try:
            # Sound effects often have bold, stylized text
            kernel_large = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (7, 7))
            # Tophat transform: bright text on dark background
            tophat = cv2.morphologyEx(gray, cv2.MORPH_TOPHAT, kernel_large)
            # Blackhat transform: dark text on bright background
            blackhat = cv2.morphologyEx(gray, cv2.MORPH_BLACKHAT, kernel_large)
            # Combine both
            combined = cv2.add(tophat, blackhat)
            # Threshold
            _, thresh = cv2.threshold(combined, 10, 255, cv2.THRESH_BINARY)
            # Find contours
            contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL,
                                           cv2.CHAIN_APPROX_SIMPLE)
            for contour in contours:
                area = cv2.contourArea(contour)
                if not 100 < area < 8000:
                    continue
                x, y, w, h = cv2.boundingRect(contour)
                aspect_ratio = w / h if h > 0 else 0
                # Sound effects can have more varied aspect ratios
                if 0.1 < aspect_ratio < 20 and w > 20 and h > 15:
                    results.append(self._rect_detection(
                        x, y, x + w, y + h, 0.4, 'sound_effect'))
        except Exception as e:
            print(f"Sound effect detection error: {e}")
        return results

    def _detect_handwritten_text(self, gray: np.ndarray) -> List[Dict]:
        """Detect handwritten text areas from Sobel gradient magnitude."""
        results = []
        try:
            # Handwritten text often has more irregular patterns
            # Use gradient-based detection
            grad_x = cv2.Sobel(gray, cv2.CV_64F, 1, 0, ksize=3)
            grad_y = cv2.Sobel(gray, cv2.CV_64F, 0, 1, ksize=3)
            magnitude = np.sqrt(grad_x**2 + grad_y**2)
            peak = magnitude.max()
            if peak == 0:
                # Uniform image: no gradients, and normalising would
                # divide by zero
                return results
            magnitude = np.uint8(magnitude / peak * 255)
            # Apply threshold
            _, thresh = cv2.threshold(magnitude, 30, 255, cv2.THRESH_BINARY)
            # Morphological operations
            kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))
            thresh = cv2.morphologyEx(thresh, cv2.MORPH_CLOSE, kernel)
            contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL,
                                           cv2.CHAIN_APPROX_SIMPLE)
            for contour in contours:
                area = cv2.contourArea(contour)
                if not 200 < area < 5000:
                    continue
                x, y, w, h = cv2.boundingRect(contour)
                aspect_ratio = w / h if h > 0 else 0
                if 0.3 < aspect_ratio < 8 and w > 25 and h > 15:
                    results.append(self._rect_detection(
                        x, y, x + w, y + h, 0.35, 'handwritten'))
        except Exception as e:
            print(f"Handwritten text detection error: {e}")
        return results

    # ------------------------------------------------------------------
    # Merging
    # ------------------------------------------------------------------
    def _merge_detections(self, detections: List[Dict]) -> List[Dict]:
        """Merge overlapping detections from different methods.

        Detections are visited from highest to lowest confidence. Each
        unvisited detection absorbs every later one whose IoU with its
        original bbox exceeds ``MERGE_IOU_THRESHOLD``. The merged entry
        keeps the text of the highest-confidence member, spans the union of
        the bboxes, and lists the contributing methods joined by '+' in
        confidence order.

        The input list and its dicts are not modified.
        """
        if not detections:
            return []
        # Sort a copy so the caller's list keeps its order
        ordered = sorted(detections, key=lambda d: d['confidence'],
                         reverse=True)
        merged = []
        used = set()
        for i, detection in enumerate(ordered):
            if i in used:
                continue
            current = detection.copy()
            current_bbox = detection['bbox']
            # Find overlapping detections
            overlaps = [
                j for j in range(i + 1, len(ordered))
                if j not in used
                and self._calculate_iou(current_bbox, ordered[j]['bbox'])
                > self.MERGE_IOU_THRESHOLD
            ]
            # Merge overlapping detections
            if overlaps:
                group = [ordered[j] for j in overlaps]
                merged_bbox = self._merge_bboxes(
                    [current_bbox] + [d['bbox'] for d in group])
                current['bbox'] = merged_bbox
                # Update polygon
                x1, y1, x2, y2 = merged_bbox
                current['polygon'] = [(x1, y1), (x2, y1), (x2, y2), (x1, y2)]
                # Combine methods; dict.fromkeys de-duplicates while keeping
                # a deterministic order (a set would not)
                methods = dict.fromkeys(
                    [current['method']] + [d['method'] for d in group])
                current['method'] = '+'.join(methods)
                # Use highest confidence
                current['confidence'] = max(
                    [current['confidence']] + [d['confidence'] for d in group])
                # Mark as used
                used.update(overlaps)
            merged.append(current)
            used.add(i)
        return merged

    def _calculate_iou(self, bbox1: Tuple, bbox2: Tuple) -> float:
        """Calculate Intersection over Union of two (x1, y1, x2, y2) boxes.

        Returns 0.0 when the boxes do not overlap or the union is empty.
        """
        x1_1, y1_1, x2_1, y2_1 = bbox1
        x1_2, y1_2, x2_2, y2_2 = bbox2
        # Calculate intersection
        x1_int = max(x1_1, x1_2)
        y1_int = max(y1_1, y1_2)
        x2_int = min(x2_1, x2_2)
        y2_int = min(y2_1, y2_2)
        if x2_int <= x1_int or y2_int <= y1_int:
            return 0.0
        intersection = (x2_int - x1_int) * (y2_int - y1_int)
        # Calculate union
        area1 = (x2_1 - x1_1) * (y2_1 - y1_1)
        area2 = (x2_2 - x1_2) * (y2_2 - y1_2)
        union = area1 + area2 - intersection
        return intersection / union if union > 0 else 0.0

    def _merge_bboxes(self, bboxes: List[Tuple]) -> Tuple:
        """Return the smallest (x1, y1, x2, y2) box enclosing all ``bboxes``.

        Raises:
            ValueError: If ``bboxes`` is empty.
        """
        x1_min = min(bbox[0] for bbox in bboxes)
        y1_min = min(bbox[1] for bbox in bboxes)
        x2_max = max(bbox[2] for bbox in bboxes)
        y2_max = max(bbox[3] for bbox in bboxes)
        return (x1_min, y1_min, x2_max, y2_max)
# ======================= ADVANCED INPAINTING =======================
class AdvancedInpainter:
"""Multi-method inpainting with quality optimization"""
def __init__(self):
self.setup_inpainting_models()
[Link] = [Link]("cuda" if [Link].is_available() else "cpu")
def setup_inpainting_models(self):
"""Setup all available inpainting methods"""
print("🔧 Setting up inpainting models...")
[Link] = {}
# Stable Diffusion Inpainting
try:
from diffusers import StableDiffusionInpaintPipeline, DiffusionPipeline
model_id = "runwayml/stable-diffusion-inpainting"
[Link]['sd'] = StableDiffusionInpaintPipeline.from_pretrained(
model_id,
torch_dtype=torch.float16 if [Link].is_available() else
torch.float32,
variant="fp16" if [Link].is_available() else None,
use_safetensors=True
).to([Link])
# Enable optimizations
if [Link].is_available():
[Link]['sd'].enable_attention_slicing()
[Link]['sd'].enable_model_cpu_offload()
try:
[Link]['sd'].enable_xformers_memory_efficient_attention()
except:
pass
print("✅ Stable Diffusion inpainting loaded")
except Exception as e:
print(f"⚠️ Stable Diffusion loading failed: {e}")
# MAT (Mask-Aware Transformer) - if available
try:
self.setup_mat_inpainter()
except:
print("⚠️ MAT inpainter not available")
# LaMa (Large Mask Inpainting) - if available
try:
self.setup_lama_inpainter()
except:
print("⚠️ LaMa inpainter not available")
print(f"✅