import re from typing import Any, Optional from urllib.parse import urlsplit, urlunsplit _SCHEMES = {"http", "https"} _LABEL_RE = re.compile(r"^[A-Za-z0-9](?:[A-Za-z0-9-]{0,61}[A-Za-z0-9])?$") _TLD_RE = re.compile(r"^[A-Za-z]{2,63}$") def _is_valid_host(host: str) -> bool: if not host or len(host) > 253: return False if host.endswith("."): host = host[:-1] parts = host.split(".") if len(parts) < 2: return False if not all(_LABEL_RE.match(p) for p in parts): return False if not _TLD_RE.match(parts[-1]): return False return True def _normalize_host(host: str) -> str: host = host.strip().strip(".").lower() host = re.sub(r"\.{2,}", ".", host) # colapsa '..' -> '.' return host def normalizar_url_para_insert(texto: Any) -> Optional[str]: """ Devuelve una URL normalizada lista para insertar en BD. Si no es válida, devuelve None (-> SQL NULL). """ if texto is None: return None s = str(texto).strip() if not s: return None # Correcciones ligeras s = s.replace(",", ".") s = re.sub(r"\s+", "", s) # Añadir esquema por defecto si falta candidate = s if "://" in s else f"http://{s}" sp = urlsplit(candidate) scheme = (sp.scheme or "http").lower() netloc, path = sp.netloc, sp.path # Caso: dominio sin esquema puede quedar en path if not netloc and path and "." in path and "/" not in path: netloc, path = path, "" # Separar userinfo/host:port (no soportamos IPv6 con corchetes aquí) hostport = netloc.rsplit("@", 1)[-1] host, port = hostport, "" if ":" in hostport: h, p = hostport.rsplit(":", 1) if h and p.isdigit(): host, port = h, p host = _normalize_host(host) # Validaciones if scheme not in _SCHEMES: return None if not _is_valid_host(host): return None if port: try: pi = int(port) if not (1 <= pi <= 65535): return None except ValueError: return None # Reconstrucción netloc (preservando userinfo si existía) userinfo = netloc[:-len(hostport)] if netloc.endswith(hostport) else "" new_netloc = f"{userinfo}{host}{(':' + port) if port else ''}" fixed = urlunsplit((scheme, new_netloc, path or "", sp.query, sp.fragment)) return fixed