# Uecko_ERP_FactuGES_sync/app/utils/websites_helper.py
# 2025-11-05 18:43:40 +01:00
#
# 84 lines
# 2.3 KiB
# Python

import re
from typing import Any, Optional
from urllib.parse import urlsplit, urlunsplit
# URL schemes accepted by normalizar_url_para_insert; any other scheme is rejected.
_SCHEMES = {"http", "https"}
# One DNS label: 1-63 characters made of ASCII letters, digits and hyphens,
# where the first and last character must not be a hyphen.
_LABEL_RE = re.compile(r"^[A-Za-z0-9](?:[A-Za-z0-9-]{0,61}[A-Za-z0-9])?$")
# Top-level domain: 2-63 ASCII letters only, so a purely numeric last label
# (as in a dotted IPv4 address) does not match.
_TLD_RE = re.compile(r"^[A-Za-z]{2,63}$")
def _is_valid_host(host: str) -> bool:
    """Return True when *host* looks like a fully qualified DNS host name.

    A valid host has at least two dot-separated labels, every label matches
    ``_LABEL_RE`` and the last label matches ``_TLD_RE``. A single trailing
    dot (the DNS root) is tolerated and ignored.

    Args:
        host: Host name to check, without scheme, userinfo or port.

    Returns:
        True if the host passes every check, False otherwise.
    """
    if not host:
        return False

    # Drop the root dot first so it does not count against the length limit.
    if host.endswith("."):
        host = host[:-1]
    if not host or len(host) > 253:
        return False

    labels = host.split(".")
    if len(labels) < 2:
        return False

    # fullmatch() is required: the patterns end in `$`, which with match()
    # would also accept a label followed by a trailing newline.
    if not all(_LABEL_RE.fullmatch(label) for label in labels):
        return False
    return _TLD_RE.fullmatch(labels[-1]) is not None
def _normalize_host(host: str) -> str:
    """Return *host* trimmed, lower-cased and with repeated dots collapsed.

    Surrounding whitespace is removed first, then any leading/trailing dots,
    then the text is lower-cased and every run of two or more dots becomes a
    single dot.
    """
    trimmed = host.strip()
    without_edge_dots = trimmed.strip(".")
    lowered = without_edge_dots.lower()
    # Collapse '..' (or longer runs) into a single '.'
    return re.sub(r"\.{2,}", ".", lowered)
def normalizar_url_para_insert(texto: Any) -> Optional[str]:
    """Return a normalized http(s) URL ready to be inserted in the database.

    The value is converted with ``str()``, stripped of all whitespace and
    given an ``http://`` scheme when it does not start with one. The host is
    then cleaned up (commas typed instead of dots, repeated dots, upper case)
    and validated before the URL is rebuilt.

    Args:
        texto: Raw value to normalize; any object is accepted.

    Returns:
        The rebuilt URL with lower-cased scheme and host, keeping userinfo,
        port, path, query and fragment as given; or ``None`` (-> SQL NULL)
        when the value is empty, cannot be parsed, uses a scheme other than
        http/https, has an invalid host, or has a port outside 1-65535.
        Bracketed IPv6 literals are not supported and yield ``None``.
    """
    if texto is None:
        return None

    # Whitespace is never meaningful inside a URL, so drop all of it.
    s = re.sub(r"\s+", "", str(texto))
    if not s:
        return None

    # Add the default scheme only when the text does not already START with
    # one; a "://" further along (e.g. inside a query string) is not a scheme.
    if re.match(r"[A-Za-z][A-Za-z0-9+.-]*://", s):
        candidate = s
    else:
        candidate = f"http://{s}"

    try:
        sp = urlsplit(candidate)
    except ValueError:
        # urlsplit raises on malformed authorities (e.g. an unbalanced "[").
        return None

    scheme = sp.scheme.lower()
    if scheme not in _SCHEMES:
        return None

    # Split "userinfo@host:port"; `at` is "@" only when userinfo was present.
    userinfo, at, hostport = sp.netloc.rpartition("@")
    host, port = hostport, ""
    head, colon, tail = hostport.rpartition(":")
    # ASCII digits only: str.isdigit() would also accept other Unicode digits,
    # which must not end up in the stored URL.
    if colon and head and re.fullmatch(r"[0-9]+", tail):
        host, port = head, tail

    # Light typo fixes are applied to the host only, so commas that
    # legitimately appear in the path, query or fragment stay untouched.
    host = _normalize_host(host.replace(",", "."))
    if not _is_valid_host(host):
        return None

    if port:
        try:
            port_number = int(port)
        except ValueError:
            # int() may still refuse an extremely long run of digits.
            return None
        if not 1 <= port_number <= 65535:
            return None

    # Rebuild the authority, preserving userinfo when it was present.
    new_netloc = f"{userinfo}{at}{host}"
    if port:
        new_netloc += f":{port}"
    return urlunsplit((scheme, new_netloc, sp.path, sp.query, sp.fragment))