feat: write the evaluation as a parquet on-disk

Not all fields, just some for now. Signed-off-by: Raito Bezarius <masterancpp@gmail.com>
2024-09-04 17:26:43 +02:00 · 2024-09-04 17:26:43 +02:00 · 71dded6ef3
parent 64fee32cc0
commit 71dded6ef3
4 changed files with 188 additions and 3 deletions
--- a/nix/overlay.nix
+++ b/nix/overlay.nix
@ -38,6 +38,8 @@ in
      aiofiles
      pydantic-settings
      uvicorn
+      polars
+      dataclass-wizard
    ];
  };
 }
--- a/src/api/evaluation/init.py
+++ b/src/api/evaluation/init.py
--- a/src/api/evaluation/models.py
+++ b/src/api/evaluation/models.py
@ -0,0 +1,130 @@
+import json
+from dataclasses import dataclass, field
+from typing import Any
+
+from dataclass_wizard import DumpMixin, JSONWizard, LoadMixin
+
+
+@dataclass
+class MaintainerAttribute(JSONWizard):
+    name: str
+    github: str | None = None
+    github_id: int | None = None
+    email: str | None = None
+    matrix: str | None = None
+
+
+@dataclass
+class LicenseAttribute(JSONWizard):
+    full_name: str | None = None
+    deprecated: bool = False
+    free: bool = False
+    redistributable: bool = False
+    short_name: str | None = None
+    spdx_id: str | None = None
+    url: str | None = None
+
+
+@dataclass
+class MetadataAttribute(JSONWizard, LoadMixin, DumpMixin):
+    outputs_to_install: list[str] = field(default_factory=list)
+    available: bool = True
+    broken: bool = False
+    unfree: bool = False
+    unsupported: bool = False
+    insecure: bool = False
+    main_program: str | None = None
+    position: str | None = None
+    homepage: str | None = None
+    description: str | None = None
+    name: str | None = None
+    maintainers: list[MaintainerAttribute] = field(default_factory=list)
+    license: list[LicenseAttribute] = field(default_factory=list)
+    platforms: list[str] = field(default_factory=list)
+    known_vulnerabilities: list[str] = field(default_factory=list)
+
+    def __pre_as_dict__(self) -> None:
+        linearized_maintainers = []
+        for maintainer in self.maintainers:
+            if maintainer.get("scope") is not None:  # pyright: ignore generalTypeIssue
+                linearized_maintainers.extend(
+                    maintainer.get("members", [])  # pyright: ignore generalTypeIssue
+                )
+            else:
+                linearized_maintainers.append(maintainer)
+        self.maintainers = linearized_maintainers
+
+
+@dataclass
+class EvaluatedAttribute(JSONWizard):
+    """
+    This is a totally evaluated attribute.
+    """
+
+    attr: str
+    attr_path: list[str]
+    name: str
+    drv_path: str
+    # drv -> list of outputs.
+    input_drvs: dict[str, list[str]]
+    meta: MetadataAttribute | None
+    outputs: dict[str, str]
+    system: str
+
+
+@dataclass
+class PartialEvaluatedAttribute:
+    """
+    This represents a potentially invalid partially
+    evaluated attribute for some reasons.
+    Open the `evaluation` for more or read the `error`.
+    """
+
+    attr: str
+    attr_path: list[str]
+    error: str | None = None
+    evaluation: EvaluatedAttribute | None = None
+
+
+def parse_total_evaluation(raw: dict[str, Any]) -> EvaluatedAttribute:
+    # Various fixups to deal with... things.
+    # my lord...
+    if raw.get("meta", {}) is None:
+        print(raw)
+
+    if (
+        raw.get("meta", {}) is not None
+        and "license" in raw.get("meta", {})
+        and not isinstance(raw.get("meta", {})["license"], list)
+    ):
+        if raw["meta"]["license"] == "unknown":
+            raw["meta"]["license"] = []
+        elif isinstance(raw["meta"]["license"], str):
+            raw["meta"]["license"] = [{"fullName": raw["meta"]["license"]}]
+        else:
+            raw["meta"]["license"] = [raw["meta"]["license"]]
+
+    new_maintainers = []
+    if (
+        raw.get("meta", {}) is not None
+        and "maintainers" in raw.get("meta", {})
+        and isinstance(raw.get("meta", {})["maintainers"], list)
+    ):
+        for maintainer in raw.get("meta", {})["maintainers"]:
+            if maintainer.get("scope") is not None:
+                new_maintainers.extend(maintainer["members"])
+            else:
+                new_maintainers.append(maintainer)
+        raw["meta"]["maintainers"] = new_maintainers
+
+    return EvaluatedAttribute.from_dict(raw)
+
+
+def parse_evaluation_result(line: str) -> PartialEvaluatedAttribute:
+    raw = json.loads(line)
+    return PartialEvaluatedAttribute(
+        attr=raw.get("attr"),
+        attr_path=raw.get("attr_path"),
+        error=raw.get("error"),
+        evaluation=parse_total_evaluation(raw) if raw.get("error") is None else None,
+    )
--- a/src/api/main.py
+++ b/src/api/main.py
@ -1,19 +1,72 @@
-from collections.abc import AsyncGenerator
-from typing import Annotated
+import time
+from collections.abc import AsyncGenerator, Generator
+from typing import Annotated, Any

-from fastapi import FastAPI, Path
+import polars as pl
+from fastapi import FastAPI, Path, WebSocket
 from fastapi.exceptions import HTTPException
 from fastapi.responses import StreamingResponse

+from api.config import settings
 from api.evaluation import evaluation_entrypoint
+from api.evaluation.models import parse_evaluation_result

 app = FastAPI()


 async def stream_evaluation(commit_sha1: str) -> AsyncGenerator[bytes, None]:
+    rows = []
    async for lines in evaluation_entrypoint(commit_sha1):
        for line in lines:
            yield line.encode("utf8")
+            eval_result = parse_evaluation_result(line)
+            if eval_result.evaluation is not None:
+                eval_result = eval_result.evaluation
+                rows.append(
+                    [
+                        eval_result.attr,
+                        eval_result.attr_path,
+                        eval_result.name,
+                        eval_result.drv_path,
+                        eval_result.input_drvs,
+                        eval_result.system,
+                    ]
+                )
+    df = pl.DataFrame(
+        rows,
+        schema={
+            "attr": pl.String,
+            "attr_path": pl.List,
+            "name": pl.String,
+            "drv_path": pl.String,
+            "input_drvs": pl.Struct,
+            "system": pl.String,
+        },
+    )
+    df.write_parquet(f"/tmp/nixpkgs-{commit_sha1}-success-eval.parquet")
+
+
+def follow(thefile: Any) -> Generator[str, None, None]:
+    thefile.seek(0, 2)
+    while True:
+        line = thefile.readline()
+        if not line:
+            time.sleep(0.1)
+            continue
+        yield line
+
+
+@app.websocket("/logs/{project_slug}/{revision}")
+async def stream_evaluation_log(
+    websocket: WebSocket,
+    project_slug: Annotated[str, Path(title="The slug of the project, e.g. nixpkgs")],
+    revision: Annotated[str, Path(title="The SHA1 revision for this repository")],
+) -> None:
+    await websocket.accept()
+    while True:
+        with open(settings.evaluation_logs_dir / f"evaluation-{revision}.log") as log:
+            for line in follow(log):
+                await websocket.send_text(line)


@app.post("/evaluations/{project_slug}/{revision}")