Starseed/docs/migration/extract_mixgraine.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Extraction + normalisation des tiers (clients / fournisseurs) depuis le CRM
Mixgraine (https://liot.mixsuite.fr) vers le format des entites Client / Supplier
de Starseed.

Principe :
  1. Pagine GET /api/customer/?...&page=N pour collecter tous les id.
  2. Pour chaque id, recupere la fiche COMPLETE via
     PUT /api/customer/{id}  body {"__data": true}
     (c'est l'appel que fait le front pour PRECHARGER le formulaire d'edition :
      il NE MODIFIE RIEN, il renvoie le schema + les valeurs courantes).
  3. Resout les selects (paymentType, banque, pays, distributeur, sites...) via
     le schema renvoye, puis normalise chaque tiers au format Starseed.
  4. Ecrit clients.json, suppliers.json, providers.json, referentials.json
     + un rapport (extraction-report.txt).

Caracteristiques :
  - Zero dependance (stdlib uniquement).
  - Cache disque par id (reprise apres interruption, pas de refetch).
  - Debit volontairement lent (--delay) pour ne pas saturer le serveur.
  - Backoff automatique sur erreur reseau / 429 / 5xx.

Usage :
  export MIXGRAINE_JWT="eyJ0eXAi..."          # ton token Bearer (NE PAS committer)
  python3 extract_mixgraine.py                # extraction complete
  python3 extract_mixgraine.py --delay 2.0    # encore plus doux (defaut : 1.0)
  python3 extract_mixgraine.py --limit-ids 20 # test rapide sur 20 tiers

Le JWT est un secret de session : passe-le par variable d'environnement,
ne l'ecris jamais en dur ici.
"""

import argparse
import json
import os
import re
import sys
import time
import urllib.error
import urllib.parse
import urllib.request

# Base URL of the Mixgraine CRM; can be overridden with the MIXGRAINE_BASE
# environment variable.
BASE = os.environ.get("MIXGRAINE_BASE", "https://liot.mixsuite.fr")
# Bearer token sent on every request: read from MIXGRAINE_JWT, falling back to
# LAUTTREE_JWT, and empty when neither variable is set.
JWT = os.environ.get("MIXGRAINE_JWT") or os.environ.get("LAUTTREE_JWT", "")

# --- Mapping tables: Mixgraine labels -> Starseed referential codes ------------

TVA_MODE = {
    "France (ventes)": "FRANCE_VENTES",
    "Export (ventes)": "EXPORT_VENTES",
    "Intracom (ventes)": "INTRACOM_VENTES",
    "France (achats)": "FRANCE_VENTES",  # no "achats" (purchases) mode in the seed -> still to be decided
}
PAYMENT_DELAY = {
    "15 jours": "J15",
    "20 jours": "J20",      # missing from the Starseed seed -> to be created
    "30 jours": "J30",
    "A reception": "A_RECEPTION",
    "A réception": "A_RECEPTION",
}
PAYMENT_TYPE = {
    "LCR non soumise": "NON_SOUMISE",  # not LCR: a RIB is not always available (RG-1.13)
    "Virement": "VIREMENT",
    "Cheque": "CHEQUE",
    "Chèque": "CHEQUE",
}
BANK = {
    "CIC": "CIC",
    "SOCIETE GENERALE": "SG",
    "CREDIT AGRICOLE": "CA",
}

# Civility prefixes stripped from contact names. They are tried in this order,
# so the bare "M" must stay last (intended order: longest first).
CIVILITES = ("Mme", "Mlle", "Mle", "M.", "Mr", "M")  # order: longest first


# --- Petites fonctions utilitaires -------------------------------------------

def http(method, path, body=None, tries=5):
    """Call the Mixgraine API with retry/backoff and return the decoded JSON.

    Args:
        method: HTTP verb ("GET", "PUT", ...).
        path: path (plus query string) appended to BASE.
        body: optional JSON-serialisable payload. When given it is sent as
            UTF-8 encoded JSON with a ``Content-Type: application/json`` header.
        tries: maximum number of attempts.

    Returns:
        The response body decoded as UTF-8 and parsed with ``json.loads``.

    Raises:
        urllib.error.HTTPError: for a non-retryable status, or for a retryable
            one (429/500/502/503/504) on the last attempt.
        OSError: for a network failure (URLError, timeout, connection reset,
            remote disconnect...) on the last attempt.
        RuntimeError: when ``tries`` is lower than 1, so no attempt was made.
    """
    url = BASE + path
    data = json.dumps(body).encode("utf-8") if body is not None else None
    headers = {
        "Accept": "application/json, text/plain, */*",
        "Authorization": "Bearer " + JWT,
    }
    if data is not None:
        headers["Content-Type"] = "application/json"
    delay = 2.0  # seconds before the first retry; doubled after each failure
    for attempt in range(1, tries + 1):
        req = urllib.request.Request(url, data=data, headers=headers, method=method)
        try:
            with urllib.request.urlopen(req, timeout=60) as resp:
                return json.loads(resp.read().decode("utf-8"))
        except urllib.error.HTTPError as e:
            # Must be handled before OSError: HTTPError is a subclass of it.
            if e.code not in (429, 500, 502, 503, 504) or attempt >= tries:
                raise
            print(f"  ! HTTP {e.code} sur {path} -> retry dans {delay:.0f}s", file=sys.stderr)
        except OSError as e:
            # URLError and TimeoutError are OSError subclasses; catching the
            # base class also retries failures raised while reading the
            # response (connection reset, remote disconnect), which urlopen
            # does not wrap in URLError.
            if attempt >= tries:
                raise
            print(f"  ! reseau ({e}) -> retry dans {delay:.0f}s", file=sys.stderr)
        time.sleep(delay)
        delay *= 2
    raise RuntimeError("echec apres %d tentatives : %s" % (tries, path))


def choices_map(field):
    """Return ``{value: label}`` for the choices of a select field of the schema.

    The choices are read from ``field["type"]["choices"]``. If the field does
    not have that shape, or an entry lacks ``value``/``label``, the entries
    collected up to that point are returned (possibly an empty dict).
    """
    labels_by_value = {}
    try:
        options = field["type"]["choices"]
        for option in options:
            label = option["label"]
            labels_by_value[option["value"]] = label
    except (KeyError, TypeError):
        # Missing or malformed schema: keep whatever was collected so far.
        return labels_by_value
    return labels_by_value


def first_id(val):
    """Extract a scalar id from the shapes Mixgraine uses for select values.

    Handled shapes:
      - a scalar id              -> returned as is (``0`` is a valid id);
      - ``""`` or ``None``       -> ``None``;
      - ``[]``                   -> ``None``;
      - ``[{"id": ...}, ...]``   -> the ``id`` of the first element;
      - ``{"id": ...}``          -> its ``id`` (instead of returning the dict
        itself, which is unhashable and cannot be used as a lookup key).

    A list whose first element is not a dict yields ``None``. A dict without
    an ``"id"`` key yields ``None`` rather than raising ``KeyError``.
    """
    if isinstance(val, list):
        head = val[0] if val else None
        if not isinstance(head, dict):
            return None
        val = head
    if isinstance(val, dict):
        val = val.get("id")
    return None if val in ("", None) else val

def parse_contact_name(name):
    """Split a contact name into ``(lastName, firstName)``.

    Example: ``'M.ROBERT Florian'`` -> ``('ROBERT', 'Florian')``.

    A leading civility from CIVILITES (case-insensitive) is removed, but only
    when it is a standalone token: either the civility itself ends with a dot
    ("M.ROBERT"), or it is followed by a dot, whitespace or the end of the
    string ("Mme Dupont", "Mr. Durand"). Without that check the bare "M"
    (or "Mr", "Mle") would eat the beginning of real surnames such as
    "MARTIN" or "MROZEK".

    Returns:
        ``(None, None)`` for an empty name or a name made of a civility only;
        ``(word, None)`` for a single word (taken as the last name);
        otherwise the first word as last name and the rest as first name.
    """
    if not name:
        return None, None
    s = name.strip()
    for civ in CIVILITES:
        if not s.upper().startswith(civ.upper()):
            continue
        rest = s[len(civ):]
        is_token = civ.endswith(".") or not rest or rest[0] == "." or rest[0].isspace()
        if is_token:
            s = rest.strip(" .")
            break
        # Prefix matched inside a longer word: try the next (shorter) civility.
    parts = s.split()
    if not parts:
        return None, None
    if len(parts) == 1:
        return parts[0], None  # a single word is taken as the last name
    return parts[0], " ".join(parts[1:])  # first word = last name, rest = first name


def clean_phone(p):
    """Clean a phone value so that it fits in 20 characters (Starseed limit).

    When the value holds several numbers or annotations
    ('... (direct) / ... (standard)'), only the text before the first ``/`` or
    ``(`` is kept. If that leaves nothing (value starting with ``(``), the
    whole stripped value is used instead.

    Returns:
        ``(phone, flag)``. ``phone`` is ``None`` for an empty or
        whitespace-only value. ``flag`` is a warning message when the value
        had to be truncated to 20 characters, else ``None``.
    """
    if not p:
        return None, None
    raw = str(p).strip()
    if not raw:
        # Whitespace-only input is "no phone", not an empty-string phone.
        return None, None
    # Keep the first number when several are given.
    candidate = re.split(r"[/(]", raw)[0].strip()
    cleaned = candidate or raw
    flag = None
    if len(cleaned) > 20:
        flag = f"tel tronque ({raw!r})"
        cleaned = cleaned[:20].strip()
    return cleaned, flag


# A valid postcode is made of 4 or 5 digits and nothing else.
POSTCODE_RE = re.compile(r"^\d{4,5}$")


def clean_postcode(p):
    """Validate a postcode and return ``(postcode, flag)``.

    An empty value gives ``(None, None)``. A value that, once converted to a
    string and stripped, matches POSTCODE_RE is returned with no flag.
    Anything else gives ``None`` plus a warning message quoting the original
    value.
    """
    if p:
        text = str(p).strip()
        if POSTCODE_RE.match(text) is None:
            return None, f"code postal invalide ({p!r})"
        return text, None
    return None, None


# --- Normalisation d'un tiers -------------------------------------------------

def normalize(record, warnings):
    """Convert one raw Mixgraine record into the Starseed import format.

    Args:
        record: decoded response of the ``PUT {"__data": true}`` call. Keys
            read here: ``fields`` (form schema, used to resolve select ids to
            labels), ``__data`` (current values) and ``details`` (its ``geo``
            entry is looked up by address id and expected to hold a
            ``"lat,lng"`` string).
        warnings: list extended in place with one human-readable message per
            data-quality issue (unmapped label, missing category, truncated
            phone, invalid postcode).

    Returns:
        A dict with the keys ``mixgraineId``, ``companyName``, ``isCustomer``,
        ``isSupplier``, ``accountNumber``, ``nTva``, ``tvaMode``,
        ``paymentDelay``, ``paymentType``, ``bank``, ``distributorName``,
        ``brokerName``, ``categories``, ``contacts``, ``addresses``, ``ribs``.
    """
    # `or {}`: a key that is present but null must behave like a missing key.
    fields = record.get("fields") or {}
    d = record.get("__data") or {}
    details = record.get("details") or {}
    geo = details.get("geo") or {}

    tid = d.get("id")
    name = d.get("name") or d.get("reference")

    # --- id -> label resolvers, built from the schema of THIS record ---
    liab = choices_map(fields.get("liability", {}))
    pdelay = choices_map(fields.get("paymentDelay", {}))
    ptype = choices_map(fields.get("paymentType", {}))
    bank = choices_map(fields.get("accountingBank", {}))
    distrib = choices_map(fields.get("distributor", {}))
    courtier = choices_map(fields.get("courtier", {}))
    cats = choices_map(fields.get("categories", {}))
    addr_fields = fields.get("addresses", {}).get("type", {}).get("fields", {})
    country_map = choices_map(addr_fields.get("country", {}))
    addr_cats = choices_map(addr_fields.get("categories", {}))
    carrier_map = choices_map(addr_fields.get("carrierType", {}))
    # Site (organisation) names are the labels of the organization_n fields.
    site_keys = ("organization_1", "organization_2", "organization_3")
    site_labels = {k: addr_fields.get(k, {}).get("label") for k in site_keys}

    def map_ref(table, label, what):
        """Translate a Mixgraine label to a Starseed code; warn when unmapped."""
        if label is None:
            return None
        code = table.get(label)
        if code is None:
            warnings.append(f"tiers {tid} ({name}): {what} non mappe : {label!r}")
        return code

    # --- accounting referentials ---
    tva = map_ref(TVA_MODE, liab.get(first_id(d.get("liability"))), "tvaMode")
    delay = map_ref(PAYMENT_DELAY, pdelay.get(first_id(d.get("paymentDelay"))), "paymentDelay")
    pay = map_ref(PAYMENT_TYPE, ptype.get(first_id(d.get("paymentType"))), "paymentType")
    bnk = map_ref(BANK, bank.get(first_id(d.get("accountingBank"))), "bank")

    # --- third-party categories ---
    categories = []
    for c in d.get("categories", []) or []:
        lbl = cats.get(c.get("id"))
        if lbl:
            categories.append(lbl)
    if not categories:
        categories = ["A QUALIFIER"]  # Starseed requires at least one category
        warnings.append(f"tiers {tid} ({name}): aucune categorie -> 'A QUALIFIER'")

    # --- contacts ---
    contacts = []
    contact_phones = set()  # digits-only form of every contact number, for de-duplication
    for c in d.get("contacts", []) or []:
        last, first = parse_contact_name(c.get("name"))
        phone, f1 = clean_phone(c.get("phone"))
        mobile, f2 = clean_phone(c.get("mobile"))
        if f1:
            warnings.append(f"tiers {tid} ({name}): {f1}")
        if f2:
            warnings.append(f"tiers {tid} ({name}): {f2}")
        if not last and not first:
            last = "Standard"  # RG-1.05/2.04: a contact needs at least a name
        for ph in (phone, mobile):
            if ph:
                contact_phones.add(re.sub(r"\D", "", ph))
        contacts.append({
            "mixgraineId": c.get("id"),
            "lastName": last,
            "firstName": first,
            "jobTitle": c.get("function"),
            "email": (c.get("email") or None),
            "phonePrimary": phone,
            "phoneSecondary": mobile,
        })

    # The phone stored on the third party itself goes into the contact list
    # (never exported at the root), unless a contact already carries it.
    base_phone, fb = clean_phone(d.get("phone"))
    if fb:
        warnings.append(f"tiers {tid} ({name}): {fb}")
    if base_phone and re.sub(r"\D", "", base_phone) not in contact_phones:
        for c in contacts:
            # Fill the first contact that has no secondary phone.
            if not c["phoneSecondary"]:
                c["phoneSecondary"] = base_phone
                break
        else:
            # No contact at all, or every contact already has a secondary
            # phone: add a dedicated "Standard" contact instead of overwriting
            # an existing number.
            contacts.append({
                "mixgraineId": None, "lastName": "Standard", "firstName": None,
                "jobTitle": None, "email": None,
                "phonePrimary": base_phone, "phoneSecondary": None,
            })

    # --- billing e-mails (entries of mails[] flagged invoice) ---
    billing_mails = [m["mail"] for m in (d.get("mails") or []) if m.get("invoice") and m.get("mail")]

    # --- addresses ---
    carrier_to_type = {"Rendu": "RENDU", "Départ": "DEPART", "Depart": "DEPART"}
    addresses = []
    for a in d.get("addresses", []) or []:
        pc, fp = clean_postcode(a.get("postcode"))
        if fp:
            warnings.append(f"tiers {tid} ({name}): {fp}")
        # Sites come from the organization_n booleans of the address.
        sites = [site_labels[k] for k in site_keys if a.get(k) and site_labels[k]]
        # Address categories, resolved to their labels (unknown ids dropped).
        acats = [addr_cats.get(c.get("id")) for c in (a.get("categories") or []) if addr_cats.get(c.get("id"))]
        # Supplier address type (Rendu/Depart) derived from carrierType.
        # first_id: selects may come as a scalar, [] or [{id: ..}]; a raw list
        # would be unhashable as a lookup key.
        carrier = carrier_map.get(first_id(a.get("carrierType")))
        supplier_addr_type = carrier_to_type.get(carrier)
        latlng = geo.get(str(a.get("id"))) or geo.get(a.get("id"))
        lat, lng = (latlng.split(",") + [None, None])[:2] if isinstance(latlng, str) else (None, None)
        addresses.append({
            "mixgraineId": a.get("id"),
            "street": a.get("street1"),
            "streetComplement": a.get("street2"),
            "postalCode": pc,
            "city": a.get("city"),
            # Unknown or missing country falls back to "France".
            "country": country_map.get(first_id(a.get("country")), "France"),
            # customer flags
            "isBilling": bool(a.get("billing")),
            "isDelivery": bool(a.get("sales")),
            "isProspect": bool(a.get("salesTrip")),
            # supplier type
            "supplierAddressType": supplier_addr_type or "PROSPECT",
            "bennes": a.get("benneCount"),
            "sites": sites,
            "categories": acats,
            "billingEmail": (billing_mails[0] if (a.get("billing") and billing_mails) else None),
            "contactMixgraineIds": [c.get("id") for c in (a.get("contacts") or [])],
            "lat": lat, "lng": lng,  # kept for information (no Starseed target)
        })

    # --- bank details (banks[]); entries without an IBAN are skipped ---
    ribs = [{
        "label": b.get("label") or "Compte principal",
        "iban": b.get("iban"),
        "bic": b.get("bic"),
    } for b in (d.get("banks") or []) if b.get("iban")]

    return {
        "mixgraineId": tid,
        "companyName": name,
        "isCustomer": bool(d.get("customer")),
        "isSupplier": bool(d.get("supplier")),
        "accountNumber": d.get("billingAccount") or None,
        "nTva": d.get("vatNumber") or None,
        "tvaMode": tva,
        "paymentDelay": delay,
        "paymentType": pay,
        "bank": bnk,
        "distributorName": distrib.get(first_id(d.get("distributor"))),
        "brokerName": courtier.get(first_id(d.get("courtier"))),
        "categories": categories,
        "contacts": contacts,
        "addresses": addresses,
        "ribs": ribs,
    }


# --- Recuperation de la liste des id -----------------------------------------

def fetch_ids_for(filters, limit, delay):
    """Page through /api/customer/ for one set of filters and return the ids.

    Args:
        filters: dict sent JSON-encoded in the ``filters`` query parameter;
            a falsy value sends no ``filters`` parameter at all.
        limit: page size.
        delay: pause in seconds between two page requests.

    Returns:
        The list of ``id`` values, in the order the pages returned them.
        Paging stops on the first empty page, or as soon as the number of
        collected ids reaches the ``count`` announced by the server.
    """
    query_head = "/api/customer/?fields=" + urllib.parse.quote('["name"]') + f"&limit={limit}&order=name"
    query_tail = ("&filters=" + urllib.parse.quote(json.dumps(filters))) if filters else ""

    collected = []
    total = None  # announced row count, read from the first response that has one
    page = 0
    while True:
        payload = http("GET", f"{query_head}&page={page}{query_tail}")
        if total is None:
            total = payload.get("count", 0)
        rows = payload.get("data", [])
        if not rows:
            return collected
        for row in rows:
            collected.append(row["id"])
        page += 1
        if total and len(collected) >= total:
            return collected
        time.sleep(delay)


def fetch_all_ids(limit, delay):
    """Collect the ids of each group (customer / supplier / prestataire) and their union.

    Membership of the filtered lists is what classifies a third party: it is
    considered more reliable than the flags of the __data form, which are
    sometimes missing.

    Args:
        limit: page size passed to fetch_ids_for.
        delay: pause in seconds between two page requests.

    Returns:
        ``(all_ids, customer_ids, supplier_ids, prestataire_ids)``: the sorted
        list of distinct ids, followed by one set of ids per group.
    """
    print("  - clients (customer=true)")
    customer_ids = set(fetch_ids_for({"customer": True}, limit, delay))
    print(f"    {len(customer_ids)}")
    print("  - fournisseurs (supplier=true)")
    supplier_ids = set(fetch_ids_for({"supplier": True}, limit, delay))
    print(f"    {len(supplier_ids)}")
    # Prestataires are exported on their own (providers.json), not merged
    # into the suppliers: the message must say so.
    print("  - prestataires (prestataire=true)  -> exportes a part (providers.json)")
    prestataire_ids = set(fetch_ids_for({"prestataire": True}, limit, delay))
    print(f"    {len(prestataire_ids)}")

    all_ids = sorted(customer_ids | supplier_ids | prestataire_ids)
    print(f"  total tiers distincts : {len(all_ids)}")
    return all_ids, customer_ids, supplier_ids, prestataire_ids


# --- Main ---------------------------------------------------------------------

def _read_cache(cache_file):
    """Return the record cached in *cache_file*, or None when absent or unreadable."""
    try:
        with open(cache_file, encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        return None
    except ValueError as e:
        # Truncated or corrupt file (e.g. a previous run was interrupted while
        # writing it): ignore it so that the record is fetched again.
        print(f"  ! cache illisible ({cache_file}: {e}) -> refetch", file=sys.stderr)
        return None


def _write_cache(cache_file, rec):
    """Write *rec* to *cache_file* through a temp file, so that a partial write is never read back."""
    tmp_file = cache_file + ".tmp"
    with open(tmp_file, "w", encoding="utf-8") as f:
        json.dump(rec, f, ensure_ascii=False)
    os.replace(tmp_file, cache_file)


def main():
    """Command-line entry point.

    Steps: (1) collect the ids of every group, (2) fetch each detailed record
    or reuse its cached copy under ``<out>/cache``, (3) normalise and classify
    each record, then write clients.json, suppliers.json, providers.json,
    referentials.json and extraction-report.txt into the output directory.

    Exits with an error message when no JWT is available.
    """
    ap = argparse.ArgumentParser(description="Extraction Mixgraine -> format Starseed")
    ap.add_argument("--out", default="mixgraine-export", help="dossier de sortie")
    ap.add_argument("--delay", type=float, default=1.0, help="pause (s) entre chaque fiche (defaut 1 req/s)")
    ap.add_argument("--limit", type=int, default=200, help="taille de page pour la liste")
    ap.add_argument("--limit-ids", type=int, default=0, help="ne traiter que N tiers (test)")
    args = ap.parse_args()

    if not JWT:
        sys.exit("ERREUR : export MIXGRAINE_JWT='<ton token>' avant de lancer.")

    cache_dir = os.path.join(args.out, "cache")
    os.makedirs(cache_dir, exist_ok=True)

    print("== Etape 1 : liste des id (par groupe) ==")
    ids, customer_ids, supplier_ids, prestataire_ids = fetch_all_ids(args.limit, args.delay)
    if args.limit_ids:
        ids = ids[:args.limit_ids]
    print(f"{len(ids)} tiers a recuperer.\n")

    print("== Etape 2 : fiches detaillees (lent, soyez patient) ==")
    raw_by_id = {}
    for n, tid in enumerate(ids, 1):
        cache_file = os.path.join(cache_dir, f"{tid}.json")
        rec = _read_cache(cache_file)
        fetched = rec is None
        if fetched:
            rec = http("PUT", f"/api/customer/{tid}", body={"__data": True})
            _write_cache(cache_file, rec)
        raw_by_id[tid] = rec
        # Progress is reported for cached records too, so a resumed run still
        # ends with the final "n/n" line.
        if n % 25 == 0 or n == len(ids):
            print(f"  {n}/{len(ids)} fiches")
        if fetched:
            time.sleep(args.delay)  # throttle only when the server was actually hit

    print("\n== Etape 3 : normalisation ==")
    warnings = []
    clients, suppliers, providers = [], [], []
    cat_set, site_set = set(), set()
    for tid in ids:
        norm = normalize(raw_by_id[tid], warnings)
        # Classification relies on MEMBERSHIP of the filtered lists:
        #   customer    -> Client (Commercial module)
        #   supplier    -> Supplier (Commercial module)
        #   prestataire -> Provider (Technique module): dedicated entity, NOT a Supplier
        is_customer = tid in customer_ids or norm["isCustomer"]
        is_supplier = tid in supplier_ids or norm["isSupplier"]
        is_prestataire = tid in prestataire_ids
        norm["isCustomer"] = is_customer
        norm["isSupplier"] = is_supplier
        norm["isPrestataire"] = is_prestataire
        # A Provider carries its sites DIRECTLY (RG-3.03): aggregate the sites of its addresses.
        norm["sites"] = sorted({s for a in norm["addresses"] for s in a["sites"]})
        cat_set.update(norm["categories"])
        for a in norm["addresses"]:
            site_set.update(a["sites"])
            cat_set.update(a["categories"])
        # A third party may hold several roles -> it is written to every matching table.
        if is_customer:
            clients.append(norm)
        if is_supplier:
            suppliers.append(norm)
        if is_prestataire:
            providers.append(norm)
        if not (is_customer or is_supplier or is_prestataire):
            clients.append(norm)  # safety net: customer by default
            warnings.append(f"tiers {tid} ({norm['companyName']}): aucun flag -> client par defaut")

    referentials = {
        "categories": sorted(cat_set),
        "sites": sorted(site_set),
        "tvaModes": sorted(set(TVA_MODE.values())),
        "paymentDelays": sorted(set(PAYMENT_DELAY.values())),
        "paymentTypes": sorted(set(PAYMENT_TYPE.values())),
        "banks": sorted(set(BANK.values())),
    }

    def dump(fname, obj):
        """Write *obj* as indented UTF-8 JSON into the output directory."""
        with open(os.path.join(args.out, fname), "w", encoding="utf-8") as f:
            json.dump(obj, f, ensure_ascii=False, indent=2)

    dump("clients.json", clients)
    dump("suppliers.json", suppliers)
    dump("providers.json", providers)
    dump("referentials.json", referentials)
    with open(os.path.join(args.out, "extraction-report.txt"), "w", encoding="utf-8") as f:
        f.write(f"Tiers traites       : {len(ids)}\n")
        f.write(f"Clients             : {len(clients)}\n")
        f.write(f"Fournisseurs        : {len(suppliers)}\n")
        f.write(f"Prestataires        : {len(providers)}\n")
        f.write(f"Categories uniques  : {len(referentials['categories'])}\n")
        f.write(f"Sites uniques       : {referentials['sites']}\n")
        f.write(f"Avertissements      : {len(warnings)}\n\n")
        f.write("\n".join(warnings))

    print("\nTermine.")
    print(f"  clients.json      : {len(clients)}")
    print(f"  suppliers.json    : {len(suppliers)}")
    print(f"  providers.json    : {len(providers)} (prestataires)")
    print(f"  referentials.json : {len(referentials['categories'])} categories, sites {referentials['sites']}")
    print(f"  avertissements    : {len(warnings)} (voir extraction-report.txt)")
    print(f"Sortie dans : {os.path.abspath(args.out)}")


if __name__ == "__main__":
    main()