Bulk URL Checker with uv: Validate Website Accessibility in Python
Learn how to build a powerful URL checker script using uv that validates multiple websites concurrently, detects broken links, and generates detailed reports.
Broken links suck. They annoy users, hurt your SEO, and make you look unprofessional. I wrote this script to check hundreds of URLs at once because manually clicking through links is a waste of time.
This tool checks URLs in parallel, categorizes what went wrong, and saves the broken ones to a file. I use it for auditing sites, checking external links, and monitoring APIs. It’s simple but gets the job done.
New to uv?
If you’re new to uv or want to learn how to set up full Python projects, start with our comprehensive guide Getting Started with uv: Setting Up Your Python Project in 2025 before diving into this advanced script.
What This Script Does
- Checks multiple URLs at once: Uses ThreadPoolExecutor to run requests in parallel
- Fixes URLs without protocols: Automatically adds HTTPS if missing
- Catches different error types: Timeouts, connection errors, HTTP errors
- Shows response times: See how fast each URL responds
- Reads from files: Load URLs from a text file
- Saves broken links: Writes problematic URLs to a file for review
- Shows progress: Real-time counter while checking
The Script
Save this as `url_checker.py`:

```python
#!/usr/bin/env -S uv run
# /// script
# dependencies = [
#     "requests",
# ]
# ///

import time
from concurrent.futures import ThreadPoolExecutor, as_completed

import requests


def check_url(url, timeout=10):
    """
    Check if a URL is accessible and return status information.

    Args:
        url (str): The URL to check
        timeout (int): Timeout in seconds (default: 10)

    Returns:
        dict: Contains url, status, status_code, error_type, and response_time
    """
    # Add https:// if no scheme is provided
    if not url.startswith(('http://', 'https://')):
        url = 'https://' + url

    start_time = time.time()
    try:
        response = requests.get(url, timeout=timeout, allow_redirects=True)
        response_time = time.time() - start_time
        return {
            'url': url,
            'status': 'OK',
            'status_code': response.status_code,
            'error_type': None,
            'response_time': round(response_time, 2)
        }
    except requests.exceptions.Timeout:
        return {
            'url': url,
            'status': 'TIMEOUT',
            'status_code': None,
            'error_type': 'Connection timeout',
            'response_time': timeout
        }
    except requests.exceptions.ConnectionError as e:
        return {
            'url': url,
            'status': 'CONNECTION_ERROR',
            'status_code': None,
            'error_type': f'Connection error: {str(e)[:100]}...',
            'response_time': round(time.time() - start_time, 2)
        }
    except requests.exceptions.RequestException as e:
        return {
            'url': url,
            'status': 'ERROR',
            'status_code': None,
            'error_type': f'Request error: {str(e)[:100]}...',
            'response_time': round(time.time() - start_time, 2)
        }


def read_urls_from_file(filename):
    """Read URLs from a text file, one per line."""
    urls = []
    try:
        with open(filename, 'r', encoding='utf-8') as file:
            for line in file:
                url = line.strip()
                if url and not url.startswith('#'):  # Skip empty lines and comments
                    urls.append(url)
        return urls
    except FileNotFoundError:
        print(f"Error: File '{filename}' not found.")
        return []
    except Exception as e:
        print(f"Error reading file: {e}")
        return []


def check_urls_batch(urls, timeout=10, max_workers=10):
    """
    Check multiple URLs concurrently.

    Args:
        urls (list): List of URLs to check
        timeout (int): Timeout per request in seconds
        max_workers (int): Maximum number of concurrent threads

    Returns:
        list: List of results for each URL
    """
    results = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit all tasks
        future_to_url = {executor.submit(check_url, url, timeout): url for url in urls}

        # Process tasks as they complete
        for i, future in enumerate(as_completed(future_to_url), 1):
            result = future.result()
            results.append(result)
            # Progress indicator
            print(f"Checked {i}/{len(urls)} URLs: {result['url']} - {result['status']}")
    return results


def main():
    # Configuration
    filename = input("Enter the filename containing URLs (or press Enter for 'urls.txt'): ").strip()
    if not filename:
        filename = 'urls.txt'

    timeout = input("Enter timeout in seconds (or press Enter for 10): ").strip()
    timeout = int(timeout) if timeout.isdigit() else 10

    print(f"\nReading URLs from '{filename}'...")
    urls = read_urls_from_file(filename)
    if not urls:
        print("No URLs found to check.")
        return

    print(f"Found {len(urls)} URLs to check.")
    print(f"Using timeout: {timeout} seconds")
    print("-" * 50)

    # Check all URLs
    results = check_urls_batch(urls, timeout=timeout)

    # Separate problematic URLs from working ones
    problematic_urls = [r for r in results if r['status'] != 'OK']
    working_urls = [r for r in results if r['status'] == 'OK']

    print("\n" + "=" * 50)
    print("SUMMARY")
    print("=" * 50)
    print(f"Total URLs checked: {len(results)}")
    print(f"Working URLs: {len(working_urls)}")
    print(f"Problematic URLs: {len(problematic_urls)}")

    if problematic_urls:
        print("\n" + "=" * 50)
        print("PROBLEMATIC URLs")
        print("=" * 50)

        # Group by error type
        timeout_urls = [r for r in problematic_urls if r['status'] == 'TIMEOUT']
        connection_error_urls = [r for r in problematic_urls if r['status'] == 'CONNECTION_ERROR']
        other_error_urls = [r for r in problematic_urls if r['status'] == 'ERROR']

        if timeout_urls:
            print(f"\nTIMEOUT ERRORS ({len(timeout_urls)}):")
            for result in timeout_urls:
                print(f"  - {result['url']}")

        if connection_error_urls:
            print(f"\nCONNECTION ERRORS ({len(connection_error_urls)}):")
            for result in connection_error_urls:
                print(f"  - {result['url']}")
                print(f"    Error: {result['error_type']}")

        if other_error_urls:
            print(f"\nOTHER ERRORS ({len(other_error_urls)}):")
            for result in other_error_urls:
                print(f"  - {result['url']}")
                print(f"    Error: {result['error_type']}")

        # Save problematic URLs to file, grouped by error type
        with open('problematic_urls.txt', 'w', encoding='utf-8') as f:
            f.write("# Problematic URLs found during check\n")
            f.write(f"# Checked on: {time.strftime('%Y-%m-%d %H:%M:%S')}\n\n")
            if timeout_urls:
                f.write("# TIMEOUT ERRORS\n")
                for result in timeout_urls:
                    f.write(f"{result['url']}\n")
                f.write("\n")
            if connection_error_urls:
                f.write("# CONNECTION ERRORS\n")
                for result in connection_error_urls:
                    f.write(f"{result['url']}\n")
                f.write("\n")
            if other_error_urls:
                f.write("# OTHER ERRORS\n")
                for result in other_error_urls:
                    f.write(f"{result['url']}\n")

        print("\nProblematic URLs saved to 'problematic_urls.txt'")

    if working_urls:
        print(f"\nWORKING URLs ({len(working_urls)}):")
        for result in working_urls:
            print(f"  ✓ {result['url']} (Status: {result['status_code']}, Time: {result['response_time']}s)")


if __name__ == "__main__":
    print("URL Connection Checker")
    print("=" * 30)
    main()
```
How It Works
| Function | What It Does |
|---|---|
| `check_url()` | Checks one URL, handles errors, times the response |
| `read_urls_from_file()` | Loads URLs from a text file, skips comments and empty lines |
| `check_urls_batch()` | Runs multiple checks in parallel using threads |
| `main()` | Handles user input, runs the checks, prints results |
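The functions also work on their own. Here's a minimal sketch of calling `check_url()` directly; it assumes `url_checker.py` sits on your import path and that requests is installed in that environment, since the inline metadata only takes effect when uv runs the file:

```python
# Minimal sketch: call check_url() directly instead of going through main().
from url_checker import check_url

result = check_url("example.com", timeout=5)  # scheme is added automatically
print(result["status"], result["status_code"], result["response_time"])
```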
Error Types
- OK: URL works fine
- TIMEOUT: Took too long to respond
- CONNECTION_ERROR: DNS or connection issues
- ERROR: Other request failures
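Each result is a dict carrying one of these values in its `status` field, so tallying a run takes one line; a small sketch, assuming `results` came back from `check_urls_batch()`:

```python
# Sketch: count results by status after a batch run.
from collections import Counter

counts = Counter(r["status"] for r in results)
print(counts)  # e.g. Counter({'OK': 7, 'CONNECTION_ERROR': 1})
```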
Running It
With uv, you just run the file. No setup needed.
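The `#!/usr/bin/env -S uv run` shebang plus the inline dependency block is what makes this work: uv resolves requests on the fly. On Linux or macOS you can also mark the script executable and run it directly:

```bash
chmod +x url_checker.py
./url_checker.py
```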
Quick Start
- Create a URL file (`urls.txt`):

```
# URLs to check
https://www.google.com
https://www.github.com
https://www.stackoverflow.com
https://nonexistent-website-12345.com
https://httpstat.us/500
https://httpstat.us/404
bitdoze.com
example.com
```

- Run it:

```bash
uv run url_checker.py
```

- Follow the prompts:
  - Press Enter for `urls.txt` or type a different filename
  - Press Enter for the 10-second timeout or enter your own
Sample Output
```
URL Connection Checker
==============================
Enter the filename containing URLs (or press Enter for 'urls.txt'):
Enter timeout in seconds (or press Enter for 10):

Reading URLs from 'urls.txt'...
Found 8 URLs to check.
Using timeout: 10 seconds
--------------------------------------------------
Checked 1/8 URLs: https://www.google.com - OK
Checked 2/8 URLs: https://www.github.com - OK
Checked 3/8 URLs: https://www.stackoverflow.com - OK
Checked 4/8 URLs: https://nonexistent-website-12345.com - CONNECTION_ERROR
Checked 5/8 URLs: https://httpstat.us/500 - OK
Checked 6/8 URLs: https://httpstat.us/404 - OK
Checked 7/8 URLs: https://bitdoze.com - OK
Checked 8/8 URLs: https://example.com - OK

==================================================
SUMMARY
==================================================
Total URLs checked: 8
Working URLs: 7
Problematic URLs: 1

==================================================
PROBLEMATIC URLs
==================================================

CONNECTION ERRORS (1):
  - https://nonexistent-website-12345.com
    Error: Connection error: HTTPSConnectionPool(host='nonexistent-website-12345.com', port=443)...

Problematic URLs saved to 'problematic_urls.txt'

WORKING URLs (7):
  ✓ https://www.google.com (Status: 200, Time: 0.15s)
  ✓ https://www.github.com (Status: 200, Time: 0.23s)
  ✓ https://www.stackoverflow.com (Status: 200, Time: 0.18s)
  ✓ https://httpstat.us/500 (Status: 500, Time: 1.02s)
  ✓ https://httpstat.us/404 (Status: 404, Time: 0.98s)
  ✓ https://bitdoze.com (Status: 200, Time: 0.45s)
  ✓ https://example.com (Status: 200, Time: 0.32s)
```
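Notice that https://httpstat.us/500 and https://httpstat.us/404 are reported as OK: the script only flags transport-level failures, so any completed HTTP response counts as working. If you want 4xx/5xx responses treated as broken links too, here's a sketch of an extra branch inside `check_url()`'s try block (you would also need to add an HTTP_ERROR group to the reporting in `main()`):

```python
# Sketch: inside check_url(), after response_time is computed,
# treat HTTP 4xx/5xx responses as problems instead of 'OK'.
if response.status_code >= 400:
    return {
        'url': url,
        'status': 'HTTP_ERROR',
        'status_code': response.status_code,
        'error_type': f'HTTP {response.status_code}',
        'response_time': round(response_time, 2),
    }
```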
Tips and Tricks
Custom Settings
Run with your own file and timeout:
```bash
uv run url_checker.py
# Enter: my_links.txt
# Enter: 5
```
Organize URL Files
Use separate files for different checks:
APIs (`apis.txt`):

```
https://api.github.com
https://httpbin.org/get
```

Social (`social.txt`):

```
https://twitter.com/myhandle
https://linkedin.com/in/me
```
Speed It Up
For lots of URLs, increase workers:
```python
results = check_urls_batch(urls, timeout=timeout, max_workers=20)
```
| URLs | Workers | Approx Time |
|---|---|---|
| 1-50 | 5-10 | 10-30 sec |
| 51-200 | 10-15 | 30-60 sec |
| 200+ | 15-25 | 1-3 min |
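The sweet spot depends on your connection and how the target servers throttle. One heuristic (an assumption, not something the script enforces) is to scale the worker count with the list size up to a cap, roughly matching the table above:

```python
# Sketch: derive a worker count from the URL list size, capped at 25.
max_workers = min(25, max(5, len(urls) // 10))
results = check_urls_batch(urls, timeout=timeout, max_workers=max_workers)
```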
Reading the Output
| Result | Meaning | What To Do |
|---|---|---|
| 200 | Works fine | Nothing |
| 301/302 | Redirect | Update if permanent |
| 404 | Not found | Fix or remove link |
| 500 | Server error | Contact site owner |
| Timeout | Too slow | Check connection or increase timeout |
| Connection Error | DNS/network | Check URL spelling |
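Because the script passes `allow_redirects=True`, a 301/302 resolves to its final status and never shows up directly. requests records the intermediate hops on `response.history`, so a small addition inside `check_url()`'s try block can surface permanent redirects worth updating (a sketch, not part of the script above):

```python
# Sketch: flag followed redirects inside check_url()'s success path.
if response.history:  # non-empty when redirects were followed
    hops = [r.status_code for r in response.history]
    print(f"{url} redirected via {hops} to {response.url}")
```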
Output File
`problematic_urls.txt` gets created with the broken links organized by error type.
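Its layout mirrors the write logic in `main()`; the sample run above would produce something like this (timestamp illustrative):

```
# Problematic URLs found during check
# Checked on: 2025-07-14 09:00:00

# CONNECTION ERRORS
https://nonexistent-website-12345.com
```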
Use Cases
- Site audits: Check your external links
- SEO: Validate backlinks
- API monitoring: Check endpoint health
- Competitor tracking: Monitor if competitor sites are down
Customizations
Add User-Agent
Some sites block requests that don't send a browser-like User-Agent header. Inside `check_url()`, pass headers along with the existing options (the UA string below is only an example; any browser-like string works):

```python
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64)'}  # example value
response = requests.get(url, headers=headers, timeout=timeout, allow_redirects=True)
```
Export to CSV
```python
import csv

def save_to_csv(results, filename='results.csv'):
    """Write the result dicts from check_urls_batch() to a CSV file."""
    fieldnames = ['url', 'status', 'status_code', 'error_type', 'response_time']
    with open(filename, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(results)
```

Call it from `main()` with `save_to_csv(results)` once `check_urls_batch()` finishes.
CI/CD Integration
```yaml
# GitHub Actions - check URLs weekly
name: URL Check
on:
  schedule:
    - cron: "0 9 * * 1"
jobs:
  check:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - run: curl -LsSf https://astral.sh/uv/install.sh | sh
      # Feed empty answers to both prompts so the script falls back to its defaults.
      - run: printf '\n\n' | uv run url_checker.py
```
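As written, the script always exits with status 0, so the job stays green even when links are broken. To make CI fail on problems, here's a sketch of a small addition at the end of `main()`:

```python
# Sketch: add at the end of main() so a run with broken links fails CI.
import sys

if problematic_urls:
    sys.exit(1)  # non-zero exit marks the workflow step as failed
```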
Wrap Up
That’s it. A simple script that checks URLs fast and tells you what’s broken. No virtual environments to manage, no dependencies to install manually. Just uv run and go.
I use this regularly to keep sites clean. Works for me, should work for you too.