> ## Documentation Index
> Fetch the complete documentation index at: https://docs.loadforge.com/llms.txt
> Use this file to discover all available pages before exploring further.

# Broken Links Checker

> Crawl website and detect broken internal/external links with detailed reporting

This guide demonstrates how to create a comprehensive broken links checker with LoadForge, crawling through all pages and detecting various types of link issues.

## Use Cases

* Detecting 404 broken internal and external links
* Finding redirect chains and infinite loops
* Validating anchor links and fragments
* Monitoring external link health over time
* SEO link validation and reporting

## Simplified Internal Links Checker

```python theme={null}
from locust import HttpUser, task, between
import json
import time
import re
from urllib.parse import urljoin, urlparse
from collections import deque
import logging


class InternalLinksChecker(HttpUser):
    """Crawl a site breadth-first and report broken internal page links.

    Each simulated user keeps its own crawl state (queue, visited set and
    broken-link list), seeded with the homepage in ``on_start``.
    """

    wait_time = between(0.5, 1)

    # Matches <a ... href="..."> and captures the href value. The whitespace
    # required before ``href`` keeps attributes such as ``data-href`` from
    # being mistaken for the real link target.
    _ANCHOR_HREF_RE = re.compile(
        r'<a\s+(?:[^>]*?\s)?href=["\']([^"\']+)["\'][^>]*>',
        re.IGNORECASE,
    )

    # Links that are never worth checking: non-HTTP schemes, a bare "#",
    # and empty/whitespace-only values.
    _SKIP_LINK_RE = re.compile(
        r'^(?:mailto:|tel:|javascript:|data:|#$|\s*$)',
        re.IGNORECASE,
    )

    # File extensions treated as static resources rather than pages.
    _RESOURCE_EXTENSIONS = (
        '.css', '.js', '.jpg', '.jpeg', '.png', '.gif', '.svg', '.ico',
        '.pdf', '.zip', '.mp4', '.mp3', '.woff', '.woff2', '.ttf', '.eot',
        '.webp', '.avif',
    )

    def on_start(self):
        """Initialize internal links checking"""
        self.visited_pages = set()
        self.pages_to_check = deque()
        self.broken_links = []
        self.total_links_checked = 0
        self.base_domain = None

        # Setup logging
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__name__)

        # Start with homepage
        self._initialize_crawler()

    def _initialize_crawler(self):
        """Initialize the crawler with the homepage.

        On a 200 response, records the target host in ``self.base_domain``
        and queues ``/``; otherwise the queue stays empty and the crawl
        task does nothing.
        """
        try:
            homepage_response = self.client.get('/', name="INIT: Homepage Check")
            if homepage_response.status_code == 200:
                self.base_domain = urlparse(self.client.base_url).netloc
                self.pages_to_check.append('/')
                self.logger.info(f"Starting internal links check for domain: {self.base_domain}")
            else:
                self.logger.error(f"Failed to access homepage: {homepage_response.status_code}")
        except Exception as e:
            # Best-effort start-up: log and leave the queue empty.
            self.logger.error(f"Error initializing crawler: {str(e)}")

    @task(10)
    def crawl_internal_pages(self):
        """Main crawling task - only checks internal links.

        Pops one page from the queue, fetches it and validates the internal
        links it contains. Pages already visited are skipped.
        """
        if not self.pages_to_check:
            return

        current_page = self.pages_to_check.popleft()
        if current_page in self.visited_pages:
            return
        self.visited_pages.add(current_page)

        try:
            response = self.client.get(current_page, name=f"CRAWL: {current_page}")
            if response.status_code == 200:
                self._extract_and_validate_internal_links(current_page, response.text)
            else:
                self._log_broken_link(current_page, current_page, response.status_code,
                                      f"Page not accessible: {response.status_code}")
        except Exception as e:
            self._log_broken_link(current_page, current_page, 0,
                                  f"Exception accessing page: {str(e)}")

    def _extract_and_validate_internal_links(self, source_page, html_content):
        """Extract and validate only internal page links (URLs only, no resources).

        Args:
            source_page: Path of the page ``html_content`` was fetched from;
                relative links are resolved against it.
            html_content: Raw HTML of that page.
        """
        # Only look for <a> tags (actual page links)
        for link in self._ANCHOR_HREF_RE.findall(html_content):
            if not self._is_internal_link(link) or self._should_skip_link(link):
                continue
            # Only validate if it's a page link (not a resource)
            if not self._is_page_link(link):
                continue

            link_ok = self._validate_internal_link(source_page, link)
            if not link_ok:
                # Already reported; queueing it would report it a second time.
                continue

            # Add to crawl queue
            normalized_link = self._normalize_url(link, source_page)
            if (normalized_link
                    and normalized_link not in self.visited_pages
                    and normalized_link not in self.pages_to_check):
                self.pages_to_check.append(normalized_link)

    def _is_internal_link(self, link):
        """Check if a link is internal (same domain or relative).

        Returns False for pure anchors, non-HTTP schemes (``mailto:``,
        ``tel:``, ...), links to another host (including protocol-relative
        ``//host/...`` links) and values that cannot be parsed as a URL.
        """
        link = link.strip()
        if not link or link.startswith('#'):
            return False
        try:
            parsed = urlparse(link)
        except ValueError:
            return False
        if parsed.scheme and parsed.scheme not in ('http', 'https'):
            return False
        if parsed.netloc:
            # Absolute or protocol-relative URL: must point at the tested host.
            return parsed.netloc.lower() == (self.base_domain or '').lower()
        # No scheme and no host: a relative path.
        return True

    def _is_page_link(self, link):
        """Check if link is a page (not a resource like image/css/js).

        Any link whose path does not end in one of the known resource
        extensions counts as a page.
        """
        path = urlparse(link.lower()).path
        return not path.endswith(self._RESOURCE_EXTENSIONS)

    def _should_skip_link(self, link):
        """Determine if a link should be skipped entirely"""
        return self._SKIP_LINK_RE.match(link) is not None

    def _validate_internal_link(self, source_page, link):
        """Validate an internal link and log any issues.

        Args:
            source_page: Page on which ``link`` was found.
            link: The href value exactly as it appeared in the HTML.

        Returns:
            True when the link answered with a status below 400, False when
            it was recorded as broken.
        """
        self.total_links_checked += 1

        normalized_url = self._normalize_url(link, source_page)
        if not normalized_url:
            self._log_broken_link(source_page, link, 0, "Invalid URL format")
            return False

        try:
            with self.client.get(normalized_url, name=f"LINK: {normalized_url}",
                                 catch_response=True) as response:
                status = response.status_code
                if status == 0:
                    # Locust returns a dummy response with status 0 when the
                    # request itself failed (connection error, timeout, ...);
                    # that must not be counted as a healthy link.
                    self._log_broken_link(source_page, link, 0, "No response received")
                    response.failure("❌ No response")
                elif status == 404:
                    self._log_broken_link(source_page, link, 404, "Page not found")
                    response.failure("❌ 404 Not Found")
                elif status >= 500:
                    self._log_broken_link(source_page, link, status, "Server error")
                    response.failure(f"❌ Server Error {status}")
                elif status >= 400:
                    self._log_broken_link(source_page, link, status, "Client error")
                    response.failure(f"❌ Client Error {status}")
                else:
                    response.success()
                    return True
        except Exception as e:
            self._log_broken_link(source_page, link, 0, f"Request failed: {str(e)}")
        return False

    def _log_broken_link(self, source_page, broken_link, status_code, reason):
        """Log broken link with detailed information.

        Appends a record to ``self.broken_links`` and writes the same message
        to both the logger and stdout.
        """
        broken_link_info = {
            'source_page': source_page,
            'broken_link': broken_link,
            'status_code': status_code,
            'reason': reason,
            'timestamp': time.time()
        }
        self.broken_links.append(broken_link_info)

        # Log to console and LoadForge
        error_msg = f"BROKEN LINK: {broken_link} (Status: {status_code}) found on page: {source_page} - {reason}"
        self.logger.error(error_msg)
        print(error_msg)

    def _normalize_url(self, link, source_page='/'):
        """Normalize URL for consistent checking.

        Args:
            link: The href value to normalize (absolute, root-relative or
                document-relative).
            source_page: Path of the page the link was found on, used to
                resolve document-relative links such as ``../about`` or
                ``next.html``. Defaults to the site root.

        Returns:
            A site-relative path (plus query string, without fragment), or
            None when the link is unparsable, uses a non-HTTP scheme or
            points at another host.
        """
        try:
            resolved = urlparse(urljoin(source_page, link.strip()))
        except ValueError:
            return None

        if resolved.scheme and resolved.scheme not in ('http', 'https'):
            return None
        if resolved.netloc and resolved.netloc.lower() != (self.base_domain or '').lower():
            return None  # External link, skip

        # "https://host" has an empty path; treat it as the site root.
        path = resolved.path or '/'
        return path + ('?' + resolved.query if resolved.query else '')

    @task(1)
    def report_status(self):
        """Print status to console (no API submission needed)"""
        if len(self.visited_pages) < 3:  # Wait until we have some data
            return

        print(f"STATUS: {len(self.visited_pages)} pages crawled, "
              f"{self.total_links_checked} links checked, "
              f"{len(self.broken_links)} broken links found")

    def on_stop(self):
        """Final summary when test completes"""
        print("\n" + "="*50)
        print("INTERNAL LINKS CHECK COMPLETE")
        print("="*50)
        print(f"Pages crawled: {len(self.visited_pages)}")
        print(f"Total links checked: {self.total_links_checked}")
        print(f"Broken links found: {len(self.broken_links)}")

        if self.broken_links:
            print(f"\nBROKEN LINKS FOUND ({len(self.broken_links)}):")
            print("-" * 40)
            for link_info in self.broken_links:
                print(f"❌ {link_info['broken_link']} (HTTP {link_info['status_code']})")
                print(f" Found on: {link_info['source_page']}")
                print(f" Reason: {link_info['reason']}")
                print()
        else:
            print("✅ No broken links found!")
```

## Comprehensive Broken Links Checker

```python theme={null}
from locust import HttpUser, task, between
import json
import time
import random
import re
from urllib.parse import urljoin, urlparse, 
urlunparse from collections import defaultdict, deque import requests class BrokenLinksChecker(HttpUser): wait_time = between(0.5, 2) def on_start(self): """Initialize broken links checking""" self.visited_pages = set() self.pages_to_check = deque() self.broken_links = [] self.redirect_chains = [] self.external_links = {} self.anchor_links = {} self.link_stats = defaultdict(int) self.base_domain = None # Start with homepage self._initialize_crawler() def _initialize_crawler(self): """Initialize the crawler with the homepage""" homepage_response = self.client.get('/', name="homepage_initial_check") if homepage_response.status_code == 200: self.base_domain = urlparse(self.client.base_url).netloc self.pages_to_check.append('/') print(f"Starting broken links check for domain: {self.base_domain}") else: print(f"Failed to access homepage: {homepage_response.status_code}") @task(5) def crawl_and_check_links(self): """Main crawling task to check links on pages""" if not self.pages_to_check: return current_page = self.pages_to_check.popleft() if current_page in self.visited_pages: return self.visited_pages.add(current_page) # Get the page content response = self.client.get(current_page, name="crawl_page_for_links") if response.status_code == 200: self._extract_and_validate_links(current_page, response.text) else: self._record_broken_link(current_page, response.status_code, "Page not accessible") def _extract_and_validate_links(self, page_url, html_content): """Extract all links from HTML and validate them""" # Find all links in the HTML link_patterns = [ r']+href=["\']([^"\']+)["\'][^>]*>', # r']+href=["\']([^"\']+)["\'][^>]*>', # r']+src=["\']([^"\']+)["\'][^>]*>', #

r']+src=["\']([^"\']+)["\'][^>]*>', #