#!/usr/bin/env python3
"""
Script to scrape region data from the wplace.live API and generate a CSV mapping.

This creates a tile-to-region mapping by sampling one pixel per tile.
The region is determined by tile coordinates, not individual pixels.
"""

import csv
import json
import os
import sys
import time
from typing import Optional, Dict

try:
    import cloudscraper
    print("✓ Using cloudscraper to bypass Cloudflare protection")
except ImportError:
    print("⚠️ cloudscraper not found. Install it with: pip install cloudscraper")
    print("   This is required to bypass Cloudflare protection on wplace.live")
    sys.exit(1)

try:
    import socks  # imported only to verify that SOCKS proxy support is installed
    print("✓ SOCKS proxy support available")
except ImportError:
    print("⚠️ PySocks not found. Install it with: pip install pysocks")
    print("   This is required to use SOCKS5 proxies")
    sys.exit(1)

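# Both third-party dependencies can be installed in one step
# (same commands as in the messages above):
#   pip install cloudscraper pysocks
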
# Configuration
BASE_URL = "https://backend.wplace.live/s0/pixel"
OUTPUT_CSV = "tile_region_mapping.csv"
OUTPUT_REGIONS_CSV = "regions.csv"

# Sample density - adjust based on how detailed you want the mapping
# 1 = sample all tiles, 2 = sample every other tile, etc.
TILE_SAMPLE_STEP = 1  # Sample every Nth tile

# Tile range - adjust based on the canvas size
TILE_X_MIN, TILE_X_MAX = 0, 2047
TILE_Y_MIN, TILE_Y_MAX = 0, 2047

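# Rough volume check: with the full 0-2047 range and TILE_SAMPLE_STEP = 1 this
# is 2048 * 2048 = 4,194,304 requests. At the ~0.1 s base delay used in main()
# that is about 116 hours of continuous scraping (ignoring request latency),
# so consider a larger step or a narrower range for a first pass.
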
# Proxy rotation - helps avoid rate limits and Cloudflare blocks
PROXIES = [
    "socks5://spmhmfozio:ze1sg%2BsP3n4apXhDV9@isp.decodo.com:10001",
    "socks5://user-spcwoviqpj-sessionduration-1440:aw2igK7QDgscu~41Gl@gate.decodo.com:10001",
    "socks5://spk2ihoy6o:ympO0wyr9X32%2BgXRfj@isp.decodo.com:10010"
]

# Track current proxy index for rotation
current_proxy_index = 0

# Rate limit backoff settings
rate_limit_backoff_seconds = 0  # Exponential backoff for rate limits
consecutive_rate_limits = 0     # Track consecutive rate limit errors

# Create a cloudscraper session that can bypass Cloudflare
scraper = cloudscraper.create_scraper(
    browser={
        'browser': 'chrome',
        'platform': 'windows',
        'mobile': False
    }
)

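# Note: create_scraper() returns a requests.Session-compatible object, so the
# proxies= and timeout= arguments used below behave as they do with plain
# requests; the browser dict above selects which browser/platform User-Agent
# cloudscraper presents to Cloudflare.
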
def get_next_proxy() -> Dict[str, str]:
    """Get the next proxy in rotation."""
    global current_proxy_index
    proxy_url = PROXIES[current_proxy_index]
    current_proxy_index = (current_proxy_index + 1) % len(PROXIES)
    return {
        'http': proxy_url,
        'https': proxy_url
    }

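# The mapping returned above is the standard requests-style proxies dict.
# Because the URLs use the socks5:// scheme, the session relies on PySocks
# (the `socks` import checked at startup) to route requests through the proxy.
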
def detect_cloudflare_rate_limit(response) -> bool:
    """Detect if a response is a Cloudflare 1015 rate limit error."""
    # An explicit 429 Too Many Requests is always a rate limit
    if response.status_code == 429:
        return True

    # Check for the Cloudflare rate limit page (error 1015)
    if 'text/html' in response.headers.get('Content-Type', ''):
        if b'error 1015' in response.content.lower() or b'rate limited' in response.content.lower():
            return True

    # Treat Cloudflare-served error statuses as rate limiting too
    cf_ray = response.headers.get('CF-RAY', '')
    if cf_ray and response.status_code in [403, 429, 503]:
        return True

    return False

def handle_rate_limit_backoff():
    """Handle exponential backoff when rate limited."""
    global rate_limit_backoff_seconds, consecutive_rate_limits

    consecutive_rate_limits += 1

    # Exponential backoff: 5s, 10s, 20s, 40s, 60s (max)
    rate_limit_backoff_seconds = min(5 * 2 ** (consecutive_rate_limits - 1), 60)

    print("\n🛑 CLOUDFLARE RATE LIMIT DETECTED (Error 1015)")
    print(f"   Consecutive rate limits: {consecutive_rate_limits}")
    print(f"   Backing off for {rate_limit_backoff_seconds} seconds...")
    time.sleep(rate_limit_backoff_seconds)

def reset_rate_limit_backoff():
    """Reset backoff when we get successful responses."""
    global rate_limit_backoff_seconds, consecutive_rate_limits
    consecutive_rate_limits = 0
    rate_limit_backoff_seconds = 0

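# For reference, a successful pixel lookup is expected to return JSON
# containing a "region" object with the fields read below, roughly
# (illustrative values only):
#   {"region": {"id": 1234, "cityId": 56, "name": "Some Region",
#               "number": 7, "countryId": 89, "flagId": 89}}
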
def fetch_tile_region(tile_x: int, tile_y: int) -> Optional[Dict]:
    """Fetch region info for a tile from the wplace.live API.

    Since the region is determined by the tile, we just need to check one
    pixel per tile. We'll use coordinates (1, 1) as a sample point.
    """
    url = f"{BASE_URL}/{tile_x}/{tile_y}?x=1&y=1"
    proxies = get_next_proxy()

    try:
        response = scraper.get(url, proxies=proxies, timeout=15)

        # Check for Cloudflare rate limiting (error 1015)
        if detect_cloudflare_rate_limit(response):
            handle_rate_limit_backoff()
            # Retry with the same proxy after backing off
            response = scraper.get(url, proxies=proxies, timeout=15)
            if detect_cloudflare_rate_limit(response):
                print(f"   Still rate limited after backoff. Skipping tile ({tile_x}, {tile_y})")
                return None

        # Check if we got HTML (a Cloudflare challenge) instead of JSON
        content_type = response.headers.get('Content-Type', '')
        if 'text/html' in content_type and not detect_cloudflare_rate_limit(response):
            print(f"\n⚠️ Received HTML instead of JSON for tile ({tile_x}, {tile_y})")
            print("   This might be a Cloudflare challenge page. Waiting 5 seconds...")
            time.sleep(5)
            # Retry once
            response = scraper.get(url, proxies=proxies, timeout=15)
            content_type = response.headers.get('Content-Type', '')
            if 'text/html' in content_type:
                print("   Still getting HTML. Skipping this tile.")
                return None

        if response.status_code == 200:
            # Success! Reset the rate limit backoff
            reset_rate_limit_backoff()
            try:
                data = response.json()
                # Extract the region if it exists
                if 'region' in data:
                    return data['region']
                return None
            except json.JSONDecodeError:
                print(f"\n⚠️ Failed to parse JSON for tile ({tile_x}, {tile_y})")
                return None
        elif response.status_code == 404:
            # No pixel painted at (1, 1); try the center of the tile instead
            url_center = f"{BASE_URL}/{tile_x}/{tile_y}?x=500&y=500"
            response = scraper.get(url_center, proxies=proxies, timeout=15)

            # Check for rate limiting on the retry
            if detect_cloudflare_rate_limit(response):
                handle_rate_limit_backoff()
                return None

            if response.status_code == 200:
                try:
                    data = response.json()
                    if 'region' in data:
                        reset_rate_limit_backoff()
                        return data['region']
                except json.JSONDecodeError:
                    pass
            return None
        elif response.status_code == 403:
            print(f"\n⚠️ Got 403 Forbidden for tile ({tile_x}, {tile_y})")
            print("   You may be rate limited. Consider increasing the delay.")
            return None
        elif response.status_code == 429:
            print(f"\n⚠️ Got 429 Too Many Requests for tile ({tile_x}, {tile_y})")
            handle_rate_limit_backoff()
            return None
        else:
            print(f"\n⚠️ Got status {response.status_code} for tile ({tile_x}, {tile_y})")
            return None
    except Exception as e:
        print(f"\n⚠️ Error fetching tile ({tile_x}, {tile_y}): {e}")
        return None

def load_already_scraped_tiles() -> set:
    """Load tiles that have already been scraped from the CSV file."""
    scraped_tiles = set()

    if os.path.exists(OUTPUT_CSV):
        print(f"📂 Found existing {OUTPUT_CSV}, loading already scraped tiles...")
        try:
            with open(OUTPUT_CSV, 'r', newline='', encoding='utf-8') as f:
                reader = csv.DictReader(f)
                for row in reader:
                    tile_x = int(row['tile_x'])
                    tile_y = int(row['tile_y'])
                    scraped_tiles.add((tile_x, tile_y))
            print(f"   ✓ Loaded {len(scraped_tiles)} already scraped tiles")
        except Exception as e:
            print(f"   ⚠️ Error reading existing CSV: {e}")
            print("   Starting fresh...")
            scraped_tiles.clear()

    return scraped_tiles

def load_unique_regions() -> dict:
    """Load unique regions from the regions CSV file."""
    unique_regions = {}

    if os.path.exists(OUTPUT_REGIONS_CSV):
        print(f"📂 Found existing {OUTPUT_REGIONS_CSV}, loading unique regions...")
        try:
            with open(OUTPUT_REGIONS_CSV, 'r', newline='', encoding='utf-8') as f:
                reader = csv.DictReader(f)
                for row in reader:
                    region_key = (int(row['id']), int(row['city_id']))
                    unique_regions[region_key] = row
            print(f"   ✓ Loaded {len(unique_regions)} unique regions")
        except Exception as e:
            print(f"   ⚠️ Error reading regions CSV: {e}")

    return unique_regions

def append_tile_to_csv(tile_data: dict):
    """Append a single tile to the CSV file."""
    file_exists = os.path.exists(OUTPUT_CSV)

    with open(OUTPUT_CSV, 'a', newline='', encoding='utf-8') as f:
        fieldnames = ['tile_x', 'tile_y', 'region_id', 'city_id',
                      'region_name', 'region_number', 'country_id', 'flag_id']
        writer = csv.DictWriter(f, fieldnames=fieldnames)

        # Write the header if the file is new
        if not file_exists:
            writer.writeheader()

        writer.writerow(tile_data)

def update_regions_csv(unique_regions: dict):
    """Update the regions CSV file with all unique regions."""
    with open(OUTPUT_REGIONS_CSV, 'w', newline='', encoding='utf-8') as f:
        fieldnames = ['id', 'city_id', 'name', 'number', 'country_id', 'flag_id']
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(unique_regions.values())

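# For reference, the headers of the two output files (matching the
# fieldnames used by the writers above):
#   tile_region_mapping.csv: tile_x,tile_y,region_id,city_id,region_name,region_number,country_id,flag_id
#   regions.csv:             id,city_id,name,number,country_id,flag_id
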
def main():
    print("Starting wplace.live region data scraping...")
    print(f"Using {len(PROXIES)} rotating proxies for requests")
    print(f"Tile range: X({TILE_X_MIN}-{TILE_X_MAX}), Y({TILE_Y_MIN}-{TILE_Y_MAX})")
    print(f"Sampling every {TILE_SAMPLE_STEP} tile(s)")

    tiles_x = (TILE_X_MAX - TILE_X_MIN + 1) // TILE_SAMPLE_STEP
    tiles_y = (TILE_Y_MAX - TILE_Y_MIN + 1) // TILE_SAMPLE_STEP
    total_to_sample = tiles_x * tiles_y

    print(f"Total tiles to sample: {total_to_sample}")
    print()

    # Load already scraped tiles for resume support
    already_scraped = load_already_scraped_tiles()

    # Load unique regions
    unique_regions = load_unique_regions()

    total_tiles = 0
    successful_tiles = 0
    skipped_tiles = len(already_scraped)
    last_success_time = time.time()

    # Sample across tiles (one sample per tile is enough since the region is tile-based)
    for tile_x in range(TILE_X_MIN, TILE_X_MAX + 1, TILE_SAMPLE_STEP):
        for tile_y in range(TILE_Y_MIN, TILE_Y_MAX + 1, TILE_SAMPLE_STEP):
            total_tiles += 1

            # Skip tiles that were already scraped
            if (tile_x, tile_y) in already_scraped:
                continue

            print(f"Sampling tile ({tile_x}, {tile_y})... [{successful_tiles} new, {skipped_tiles} skipped/cached]", end="\r")

            region = fetch_tile_region(tile_x, tile_y)

            if region:
                # Prepare the tile row
                tile_data = {
                    'tile_x': tile_x,
                    'tile_y': tile_y,
                    'region_id': region.get('id'),
                    'city_id': region.get('cityId'),
                    'region_name': region.get('name'),
                    'region_number': region.get('number'),
                    'country_id': region.get('countryId'),
                    'flag_id': region.get('flagId')
                }

                # Immediately write to the CSV (for resume support)
                append_tile_to_csv(tile_data)

                # Store the region if it is new
                region_key = (region.get('id'), region.get('cityId'))
                if region_key not in unique_regions:
                    unique_regions[region_key] = {
                        'id': region.get('id'),
                        'city_id': region.get('cityId'),
                        'name': region.get('name'),
                        'number': region.get('number'),
                        'country_id': region.get('countryId'),
                        'flag_id': region.get('flagId')
                    }
                    # Update the regions CSV whenever a new region appears
                    update_regions_csv(unique_regions)

                successful_tiles += 1
                last_success_time = time.time()
            else:
                skipped_tiles += 1

            # If we've had too many failures in a row, warn the user
            if time.time() - last_success_time > 60:
                print("\n⚠️ No successful requests in the last 60 seconds.")
                print("   You may be blocked by Cloudflare. Consider:")
                print("   1. Increasing TILE_SAMPLE_STEP to reduce the request rate")
                print("   2. Taking a break and trying again later")
                print("   3. Using a VPN or a different IP address")
                print("   Progress is saved! You can resume by running this script again.")
                user_input = input("Continue? (y/n): ")
                if user_input.lower() != 'y':
                    break
                last_success_time = time.time()

            # Rate limiting - be nice to their server.
            # Add extra delay if we've been rate limited recently.
            base_delay = 0.1
            if consecutive_rate_limits > 0:
                base_delay = 0.5
            time.sleep(base_delay)

        else:
            # Inner loop finished normally: move on to the next tile column.
            continue
        # Inner loop was broken out of (user chose to stop): stop the outer loop too.
        break

    print()
    print("\n✅ Scanning complete!")
    print(f"Total tiles processed: {total_tiles}")
    print(f"New tiles scraped: {successful_tiles}")
    print(f"Skipped/cached tiles: {skipped_tiles}")
    print(f"Unique regions found: {len(unique_regions)}")

    # Final update of the regions CSV
    if unique_regions:
        update_regions_csv(unique_regions)
        print(f"✓ Final regions list saved to {OUTPUT_REGIONS_CSV}")

    print(f"✓ Tile-to-region mappings in {OUTPUT_CSV}")
    print("\n💡 Tip: You can resume this script anytime - progress is automatically saved!")
    print("   Done! You can now import this data into your database.")

if __name__ == "__main__":
    main()