84 lines
2.3 KiB
Python
84 lines
2.3 KiB
Python
import re
|
|
from typing import Any, Optional
|
|
from urllib.parse import urlsplit, urlunsplit
|
|
|
|
# URL schemes accepted by the normalizer; anything else is treated as invalid.
_SCHEMES = {"http", "https"}

# One DNS label: 1-63 ASCII letters/digits/hyphens that neither starts nor
# ends with a hyphen. Anchored with \Z rather than $, because $ also matches
# just before a trailing newline and would let "label\n" through.
_LABEL_RE = re.compile(r"^[A-Za-z0-9](?:[A-Za-z0-9-]{0,61}[A-Za-z0-9])?\Z")

# Top-level domain: 2-63 ASCII letters only.
# NOTE(review): this rejects punycode TLDs ("xn--...") -- confirm that is intended.
_TLD_RE = re.compile(r"^[A-Za-z]{2,63}\Z")
|
|
|
|
def _is_valid_host(host: str) -> bool:
    """Return True when *host* is a syntactically valid multi-label hostname.

    A host is accepted when, ignoring one optional trailing dot, it:

    * is non-empty and at most 253 characters long,
    * has at least two dot-separated labels,
    * has every label matching ``_LABEL_RE``, and
    * has a last label (the TLD) matching ``_TLD_RE``.

    Single-label names (e.g. ``localhost``) and IP literals are rejected.
    """
    if not host:
        return False

    # One trailing dot only marks the DNS root; drop it before measuring so a
    # maximal-length name written as "name." is not rejected for the dot.
    if host.endswith("."):
        host = host[:-1]
    if not host or len(host) > 253:
        return False

    labels = host.split(".")
    if len(labels) < 2:
        return False

    # fullmatch() requires the whole label to be consumed, so a trailing
    # newline (which a `$` anchor tolerates under match()) is rejected.
    if not all(_LABEL_RE.fullmatch(label) for label in labels):
        return False

    return _TLD_RE.fullmatch(labels[-1]) is not None
|
|
|
|
def _normalize_host(host: str) -> str:
    """Return *host* in the canonical form used before validation.

    Surrounding whitespace is trimmed first, then leading and trailing dots
    are removed, the text is lower-cased, and every run of two or more
    consecutive dots is collapsed into a single dot.
    """
    trimmed = host.strip().strip(".")
    lowered = trimmed.lower()
    # Squash runs such as "a..b" into "a.b".
    return re.sub(r"\.{2,}", ".", lowered)
|
|
|
|
def normalizar_url_para_insert(texto: Any) -> Optional[str]:
    """Return a normalized http(s) URL ready to be inserted into the database.

    The input is converted with ``str()`` and lightly repaired:

    * all whitespace is removed;
    * ``http://`` is prepended when the text does not start with a scheme;
    * commas in the host are treated as mistyped dots;
    * the host is lower-cased and cleaned by ``_normalize_host``;
    * an explicit port is re-emitted in canonical decimal form.

    Userinfo, path, query and fragment are preserved unchanged.

    Args:
        texto: Any value; ``None`` and blank strings are treated as "no URL".

    Returns:
        The normalized URL, or ``None`` (-> SQL NULL) when the input is empty,
        cannot be parsed, uses a scheme outside ``_SCHEMES``, has a host that
        ``_is_valid_host`` rejects, or has a port outside 1-65535.
        Bracketed IPv6 literals are not supported and yield ``None``.
    """
    if texto is None:
        return None

    s = str(texto).strip()
    if not s:
        return None

    # Light correction: drop whitespace typed inside the URL.
    s = re.sub(r"\s+", "", s)

    # Add the default scheme only when the text does not START with one.
    # A bare '"://" in s' test is fooled by URLs embedded in the path or
    # query, e.g. "example.com/r?u=http://other".
    if re.match(r"[A-Za-z][A-Za-z0-9+.-]*://", s):
        candidate = s
    else:
        candidate = f"http://{s}"

    # urlsplit raises ValueError on malformed authorities (for instance an
    # unbalanced "["); such input is simply "not a valid URL" here.
    try:
        sp = urlsplit(candidate)
    except ValueError:
        return None

    scheme = sp.scheme.lower()
    if scheme not in _SCHEMES:
        return None

    # Split "userinfo@host:port". The "@" stays attached to userinfo so the
    # netloc can be rebuilt by plain concatenation.
    userinfo, at_sign, hostport = sp.netloc.rpartition("@")
    userinfo += at_sign

    host, port = hostport, ""
    head, colon, tail = hostport.rpartition(":")
    if colon and head and tail.isdigit():
        host, port = head, tail

    # Commas are treated as mistyped dots, but only inside the host: doing
    # this on the whole URL would corrupt userinfo, path and query data.
    host = _normalize_host(host.replace(",", "."))
    if not _is_valid_host(host):
        return None

    if port:
        # str.isdigit() also accepts characters int() cannot parse (e.g.
        # superscript digits), hence the explicit ValueError guard.
        try:
            port_number = int(port)
        except ValueError:
            return None
        if not 1 <= port_number <= 65535:
            return None
        # Canonical ASCII decimal: no leading zeros, no non-ASCII digits.
        port = str(port_number)

    new_netloc = f"{userinfo}{host}:{port}" if port else f"{userinfo}{host}"
    return urlunsplit((scheme, new_netloc, sp.path, sp.query, sp.fragment))
|