diff --git a/.github/workflows/deploy-cloudflare-workers.yml b/.github/workflows/deploy-cloudflare-workers.yml index e8d44d0..e96b497 100644 --- a/.github/workflows/deploy-cloudflare-workers.yml +++ b/.github/workflows/deploy-cloudflare-workers.yml @@ -7,6 +7,12 @@ on: branches: - master +permissions: + contents: read + +concurrency: + group: cloudflare-workers-master + cancel-in-progress: true jobs: deploy: @@ -16,30 +22,32 @@ jobs: - name: Checkout uses: actions/checkout@v6 - - name: Setup PNPM - uses: pnpm/action-setup@v4 - with: - version: 10.12.1 - run_install: false - - name: Use Node.js 24 uses: actions/setup-node@v6 with: node-version-file: .nvmrc - cache: pnpm + cache: npm + cache-dependency-path: | + doesitarm-default/package-lock.json + workers/analytics/package-lock.json - name: Write Wrangler configs + shell: bash run: | - echo ${{ secrets.WRANGLER_ENV }} | base64 -d > doesitarm-default/.env - cat doesitarm-default/.env - echo ${{ secrets.WRANGLER_TOML }} | base64 -d > doesitarm-default/wrangler.toml - cat doesitarm-default/wrangler.toml - pnpm install + set -euo pipefail + umask 077 - # Analytics Worker - echo ${{ secrets.ANALYTICS_WRANGER_TOML }} | base64 -d > workers/analytics/wrangler.toml - cat workers/analytics/wrangler.toml - pnpm install + printf '%s' '${{ secrets.WRANGLER_ENV }}' | base64 --decode > doesitarm-default/.env + printf '%s' '${{ secrets.WRANGLER_TOML }}' | base64 --decode > doesitarm-default/wrangler.toml + printf '%s' '${{ secrets.ANALYTICS_WRANGER_TOML }}' | base64 --decode > workers/analytics/wrangler.toml + + - name: Install default worker dependencies + working-directory: doesitarm-default + run: npm ci + + - name: Install analytics worker dependencies + working-directory: workers/analytics + run: npm ci - name: Deploy Default Worker uses: cloudflare/wrangler-action@1.3.0 diff --git a/.gitignore b/.gitignore index 1d4390d..191b622 100644 --- a/.gitignore +++ b/.gitignore @@ -100,3 +100,5 @@ dist /README-temp.md .DS_Store 
/.vscode/snipsnap.code-snippets +.vercel +.env*.local diff --git a/docs/plans/app-discovery-d1-automation.md b/docs/plans/app-discovery-d1-automation.md new file mode 100644 index 0000000..addef70 --- /dev/null +++ b/docs/plans/app-discovery-d1-automation.md @@ -0,0 +1,216 @@ +# Original Prompt + +> Let's start a draft plan in docs/plans to get the app to a place where we can pull in these app data from sources automaically. +> +> We'll need to migrate all the existing and new data from Google Sheets to Cloudflare D1. +> +> We'll need to setup and run the app discovery pipeline locally on this machine. +> +> We'll need to test if we can get the app canner to be isomorphic so that we can scan from both node/bun and the browser. +> +> We'll need to audit and fix the pending app compatibility issue for the scanner. +> +> We'll need to setup an automated app discovery either on GitHub Actions or Cloudflare. +> +> Read through our docs and let me know any other work we need to add to this list + +# Goal + +Get Does It ARM to a local-first, automatable app discovery and scan pipeline that uses Cloudflare D1 as the canonical data store, can be run on this machine, can backfill legacy data from Google Sheets and other current feeds, can later run on a scheduler without breaking the existing site build, and lives inside a maintainable public monorepo with explicit app, package, database, and infrastructure boundaries. + +# Non-Goals + +- Rebuild the frontend or replace Astro/Netlify in the first pass. +- Automate every source class on day one. +- Force full browser and Node/Bun archive parity before the feasibility spike is complete. +- Remove the manual README/list flow before D1-backed equivalents exist. +- Switch package manager/runtime just because the inspiration repo uses Bun. 
+ +# Repo Findings + +- There is already one scanner-focused plan in `docs/plans/app-test-typescript-refactor.md`, but it is narrower than this request and mainly covers Playwright safety rails plus incremental TypeScript conversion. +- The repo is still organized as a flat root with mixed app, build, worker, helper, and infra concerns rather than as a workspace monorepo. +- The app-test UI lives in `pages/apple-silicon-app-test.vue` and is mounted by `src/pages/apple-silicon-app-test.astro`. +- There are two scanner surfaces today: +- `helpers/app-files-scanner.js` is the legacy path still used by the page by default. +- `helpers/scanner/scan.ts`, `helpers/scanner/client.ts`, and `helpers/scanner/worker.ts` implement the newer worker-based scanner exposed behind `?version=2`. +- Browser coverage already exists for both scanner variants in `test/playwright/apple-silicon-app-test.playwright.ts`. +- That browser coverage is not currently green in local execution: on April 4, 2026, `pnpm exec vitest run --config vitest.playwright.config.mjs test/playwright/apple-silicon-app-test.playwright.ts` timed out waiting for the Astro dev server. +- Worker-scanner coverage already exists in `test/scanner/client.test.ts`. +- The site build still assembles static output from remote/env-backed sources rather than a local database: +- `helpers/build-app-list.js` reads `COMMITS_SOURCE`, `SCANS_SOURCE`, and `VFUNCTIONS_URL`. +- `helpers/build-homebrew-list.js` reads `HOMEBREW_SOURCE`. +- `helpers/build-game-list.js` reads `GAMES_SOURCE`. +- `helpers/build-device-list.js` reads `VFUNCTIONS_URL`. +- `build-lists.js` composes the generated app, game, homebrew, device, and video outputs from those inputs. +- `scripts/scan-new-apps.js` is not a working discovery pipeline yet. It has `runScans = false`, exits on Linux, and only spins up a local server stub. +- The repo has Cloudflare deployment automation, but not app-level D1 plumbing in source control. 
`.github/workflows/deploy-cloudflare-workers.yml` deploys `doesitarm-default/` and `workers/analytics/`, and writes Wrangler config from GitHub secrets at CI time. +- That same Cloudflare workflow currently prints secret-derived `.env` and `wrangler.toml` files to CI logs, which should be treated as an immediate security fix. +- There is a local `.env` with the current runtime contract, but there is no checked-in bootstrap/setup doc for another machine or another session. + +# External Research + +- Read alongside `docs/research/public-repo-security-and-monorepo-patterns-2026-04-04.md`. +- The Notion research backlog repeatedly describes the target ingestion loop as: discover source page or asset URL, download the archive, recursively extract ZIP/DMG/PKG contents, find the `.app` or Mach-O payload, scan architecture/metadata, and persist results. +- Notion still points to Homebrew as a primary source class. `Download and scan Homebrew Casks` and `Source from Homebrew Casks URLs` emphasize starting with ZIPs, then DMGs, then known extensions, while filtering already-known/native apps, batching releases, and adding timeouts/retries. +- Additional documented source classes include generic download pages, GitHub app lists and release assets, Mac App Store data, Product Hunt, Nix, and MacPorts. +- The newer `Data Source Priorities` note from May 8, 2024 still lists `Homebrew Cask Scans`, `Homebrew Formulae API`, `App Scans`, and `Product Hunt Apps/API` as important sources. +- Notion’s App Test notes map closely to the repo’s current scanner work: archive URL submission, download-page scanning, recursive extraction, and writing scan data to a store. +- I did not find explicit Playwright or agent-browser planning in Notion. That looks like a newer execution approach layered on top of the older source-discovery research rather than a documented historical plan. 
+- The new security/monorepo research recommends keeping the repo public, keeping secrets and raw operational data private, using a Kriasoft-style `apps/` + `packages/` + `db/` + `infra/` layout, defaulting GitHub Actions to read-only tokens and GitHub-hosted runners, and avoiding privileged `pull_request_target` flows that checkout PR code. + +# Recommendation + +- Make D1 the canonical store for discovered apps, source observations, scan runs, and site-facing aggregated records. +- Refactor toward a Kriasoft-style public monorepo before adding more discovery and infra code to the flat root. Adapt the layout, not the entire toolchain. +- Keep the repo public, but keep secrets, raw imports, downloaded archives, quarantine material, and privileged operational state private. +- Add a public-repo security-hardening stage before expanding automation: remove secret logging, tighten GitHub Actions permissions, guard workflow/infra/db paths with `CODEOWNERS`, ignore local secret/state files, and keep public CI on GitHub-hosted runners. +- Stage the work in this order: monorepo target and boundaries, public-repo security hardening, data model and D1 migration contract, scanner stabilization, local discovery pipeline, D1-backed read-path migration, then scheduled automation. +- Keep the scan core runtime-agnostic where feasible, but do not block delivery on proving identical browser and Node/Bun support for every archive type. If DMG/PKG support is impractical in the browser, make that an explicit server-side fallback instead of an accidental limitation. +- Use GitHub Actions as the first scheduled runner because the repo already has scheduled workflows and no committed D1/Wrangler local workflow yet. Keep Cloudflare as the longer-term execution target if a D1-bound ingestion worker becomes the canonical runtime. + +# Rollout Plan + +1. Define the monorepo target and carve the repository into stable boundaries. 
+- Design the target workspace layout in the style of `kriasoft/react-starter-kit`, adapted to the current pnpm repo: +- `apps/web/` for the Astro site and app-test UI +- `apps/default-worker/` and `apps/analytics-worker/` for current Cloudflare workers +- `apps/discovery/` for discovery/ingest entry points if a dedicated app surface is warranted +- `packages/scanner-core/`, `packages/source-runners/`, `packages/data-model/`, and `packages/site-build/` for shared logic +- `db/` for D1 migrations, seeds, import scripts, and local DB helpers +- `infra/` for Wrangler config, deployment helpers, and infra policy docs +- Keep the package manager as pnpm unless a later decision explicitly changes it. +- Decide which existing root paths move first so the refactor stays reviewable and does not block all feature work at once. + +2. Establish the public-repo security baseline before adding more automation. +- Remove secret printing from CI and rotate any secrets that may have been exposed in logs. +- Adopt a public template env pattern: +- commit a placeholder-only `.env.example` or equivalent template +- keep real credentials in ignored local files and secret managers +- ignore `.env.local`, `.dev.vars`, `.wrangler/`, and other new local-secret/state paths introduced by D1/Workers work +- Set the default `GITHUB_TOKEN` posture to read-only and grant additional permissions per job only where needed. +- Add `CODEOWNERS` or equivalent review protection for `.github/workflows/`, `infra/`, and `db/`. +- Prefer GitHub-hosted runners for public CI and scheduled jobs; do not use self-hosted runners for public PR workflows. +- Add or enable repository security defaults suitable for public OSS: secret scanning, push protection, private vulnerability reporting, dependency review, and workflow/action pinning policy. + +3. Define the source-of-truth data model and migration contract. 
+- Inventory every current sheet/feed that contributes app, scan, bundle, device, game, and Homebrew data. +- Define D1 tables for apps, app versions, bundles, source types, source observations, discovered assets, scan runs, import jobs, and sync checkpoints. +- Define provenance and dedupe rules for slug, bundle ID, download URL, version string, scan hash, source type, and timestamps like `first_seen_at` and `last_seen_at`. +- Write a field-mapping document from Google Sheets and current remote JSON feeds into the D1 schema. +- Decide which current static outputs remain derived snapshots and which should become D1-backed runtime/API reads. +- Add Wrangler/D1 project config, migrations, and local/staging/prod database environments. +- Build import scripts for existing Google Sheets data and any current remote feeds that must be preserved for continuity. +- Add validation output for import counts, duplicate merges, and skipped/error records. +- Document the local bootstrap on this machine: required env keys, auth/setup steps, migration commands, import commands, and reset/rollback commands. + +4. Stabilize the scanner and resolve the pending compatibility issue. +- Identify the concrete pending scanner compatibility issue, capture a reproducible sample or fixture, and turn it into an automated regression test. +- Audit the legacy `helpers/app-files-scanner.js` path against the worker scanner in `helpers/scanner/*` and decide the deprecation/default path. +- Expand scanner fixtures beyond the current happy-path native ZIP to include Intel-only, malformed, nested, DMG, and PKG examples where legally redistributable. +- Decide whether “isomorphic scanner” means one shared scan core with environment-specific file loaders or truly identical archive support in browser and Node/Bun. +- If full parity is not practical, document and test the split: browser handles `.app` and `.zip`; Node/Bun handles heavier formats like `.dmg` and `.pkg`. + +5. 
Build the local app discovery pipeline. +- Implement a bounded CLI or script entry point that can run locally on this machine and execute source discovery, download, extraction, scan, and D1 persistence end to end. +- Start with the highest-value documented sources: Homebrew Casks and Homebrew source URLs, then GitHub releases/lists, then generic download pages. +- Normalize all source runners into one contract: source page -> discovered asset URLs -> fetched artifact -> extracted app candidates -> scan result -> persisted record. +- Add rate limiting, retry/backoff, timeout, file-size guards, duplicate suppression, and quarantine/error buckets for failed downloads or unsupported archives. +- Persist per-run audit data so reruns can skip already-processed successes and focus on failures or changed sources. + +6. Move site and build reads onto D1-backed interfaces. +- Replace or wrap the current remote feed dependencies (`SCANS_SOURCE`, `HOMEBREW_SOURCE`, `GAMES_SOURCE`, `VFUNCTIONS_URL`, `PUBLIC_API_DOMAIN`) with D1-backed queries or exported snapshots generated from D1. +- Define how App Test submissions and automated discovery scans appear on app pages, listings, and API outputs without duplicate records. +- Introduce compatibility adapters so the site can cut over source-by-source instead of all at once. +- Verify whether any normalization currently lives behind the existing remote endpoints and port that logic before removing those dependencies. + +7. Add scheduled automation and operational controls. +- Choose the first scheduler architecture: +- GitHub Actions if the job mostly runs scripts or triggers a small ingestion endpoint. +- Cloudflare if a D1-bound worker becomes the canonical ingestion runtime. +- Add a staging/dry-run lane before production writes. +- Emit run summaries for discovered apps, downloaded assets, successful scans, failed scans, D1 writes, duplicates skipped, and retries exhausted. 
+- Add pause/resume controls per source plus playbooks for reruns, backfills, and bad-import recovery. +- If Cloudflare becomes the later execution target, evaluate Workflows/Queues plus Workers Logs/Traces as the durable background-processing and observability surface. + +# Validation Gates + +- Monorepo scaffolding is internally consistent: +- workspace install succeeds +- shared packages resolve from the intended workspace paths +- moved apps still build from their new locations +- Security baseline changes are reviewed and verified: +- CI no longer prints secret-derived config to logs +- workflow permissions are explicitly declared +- protected paths (`.github/workflows/`, `infra/`, `db/`) have review ownership +- Existing scanner coverage passes: +- `pnpm exec vitest run test/scanner/client.test.ts` +- `pnpm exec vitest run --config vitest.playwright.config.mjs test/playwright/apple-silicon-app-test.playwright.ts` +- Site/build health passes after read-path changes: +- `pnpm typecheck` +- `pnpm build` +- `pnpm test` +- Migration validation artifact exists and is reviewed: +- source row counts vs D1 row counts +- duplicate/merge report for slugs, bundle IDs, versions, and scan hashes +- sample record spot checks across apps, scans, Homebrew, devices, and games +- Local discovery dry run succeeds against a bounded batch: +- at least 10 Homebrew apps +- at least 3 non-Homebrew direct-download pages +- persisted run summary saved to a reviewable artifact +- Automation validation succeeds in staging: +- one scheduled run completes +- rerunning the same batch is idempotent +- no uncontrolled duplicate writes are produced + +# Deliverables + +- A repo-local execution plan in `docs/plans/app-discovery-d1-automation.md` +- A repo-local research memo in `docs/research/public-repo-security-and-monorepo-patterns-2026-04-04.md` +- A target monorepo layout and migration sequence for the current flat-root repo +- A public-repo security baseline for workflows, secrets, and 
local operational state +- D1 schema and migration design +- Local bootstrap documentation for discovery plus D1 +- Import/backfill scripts and validation report shape +- Scanner audit and regression-coverage plan +- Local discovery pipeline plan and runnable entry point +- Scheduled automation decision with staging rollout path + +# Risks And Open Questions + +- The pending scanner compatibility issue is not named in repo docs, so the first task in that stage is to identify the exact failing app/archive and capture it as a test case. +- Browser-safe DMG/PKG support may not be realistic. Forcing parity could delay delivery more than an explicit split runtime. +- The monorepo refactor can easily sprawl if it tries to move every surface at once; the first slice should focus on boundaries that unblock scanner, D1, and discovery work. +- The current site mixes manual README content, build-time remote feeds, and app-test scan data. The D1 cutover boundary needs an explicit decision. +- Generic site downloads and third-party source scraping need a clear policy for rate limits, robots/TOS, timeouts, and file-size caps. +- Existing remote endpoints may contain normalization logic that is not visible in this repo. That logic has to be audited before those feeds are replaced. +- Cloudflare deployment exists today, but D1 local/prod workflow is not checked in. Local reproducibility depends on adding that missing surface. +- Cloudflare deployment may continue to require long-lived tokens in GitHub Actions even after hardening, so environment scoping and review gates matter. +- If the worker scanner becomes default, the legacy scanner should not remain as an unowned fallback indefinitely. 
+ +# Sources + +- `docs/research/public-repo-security-and-monorepo-patterns-2026-04-04.md` +- `docs/plans/app-test-typescript-refactor.md` +- `docs/app-flow.md` +- `build-lists.js` +- `helpers/build-app-list.js` +- `helpers/build-homebrew-list.js` +- `helpers/build-game-list.js` +- `helpers/build-device-list.js` +- `helpers/app-files-scanner.js` +- `helpers/scanner/scan.ts` +- `helpers/scanner/client.ts` +- `test/playwright/apple-silicon-app-test.playwright.ts` +- `test/scanner/client.test.ts` +- `.github/workflows/deploy-api.yaml` +- `.github/workflows/deploy-cloudflare-workers.yml` +- `.github/workflows/deploy-frontend.yaml` +- `.github/workflows/deploy-functions.yaml` +- Notion: Download and scan Homebrew Casks +- Notion: Source from Homebrew Casks URLs +- Notion: Source from generic App/Download page links +- Notion: App Tester +- Notion: Get App Test Working in Node +- Notion: Detect and decompressed ZIP, DMG, and PKG +- Notion: Data Source Priorities diff --git a/docs/plans/app-test-typescript-refactor.md b/docs/plans/app-test-typescript-refactor.md new file mode 100644 index 0000000..71165d5 --- /dev/null +++ b/docs/plans/app-test-typescript-refactor.md @@ -0,0 +1,49 @@ +# Original Prompt + +> OK, this is really great. I've been wanting to do a TypeScript conversion for this repo for a while. Tell me about that. We also need to... I want-- we have a-- the app test feature. Every time I try to touch that code, it gets fragile. So build a playwright test to verify that feature so that way we can scan apps, and then get that working. and then let's start a refactor. But once you have that working and verified, Let's start to refactor to get this converted all to TypeScript. + +# Goal + +Lock the Apple Silicon app-test flow with an end-to-end browser test, fix regressions in the real scan/upload path, and begin the TypeScript conversion with small, reviewable changes around the browser-test and scanner surface. 
+ +# Non-Goals + +- Full repo-wide JavaScript-to-TypeScript conversion in one pass. +- Replacing the scan engine implementation without test coverage first. +- Changing user-facing app-test behavior beyond what is needed to make the feature reliable. + +# Repo Findings + +- The app-test UI is implemented in [pages/apple-silicon-app-test.vue](/Users/athena/Code/doesitarm/pages/apple-silicon-app-test.vue) and mounted by [src/pages/apple-silicon-app-test.astro](/Users/athena/Code/doesitarm/src/pages/apple-silicon-app-test.astro). +- The current browser-test harness exists, but only covers Pagefind in [test/playwright/pagefind-native-filter.playwright.js](/Users/athena/Code/doesitarm/test/playwright/pagefind-native-filter.playwright.js). +- The app-test flow depends on archive extraction, plist parsing, Mach-O parsing, and an HTTP POST to `TEST_RESULT_STORE` via [helpers/app-files-scanner.js](/Users/athena/Code/doesitarm/helpers/app-files-scanner.js). +- A newer worker-based scanner path exists behind `?version=2`, but the production page still defaults to the legacy path. + +# Decision + +Add a deterministic Playwright upload test that scans a generated zipped `.app` bundle against the real page, stub only the remote result-store POST, and use that as the safety rail before starting TypeScript refactors. + +# Rollout Plan + +1. Add typed Playwright support for spinning up Astro and generating a known-good app archive fixture. +2. Add a browser test for `/apple-silicon-app-test/` that uploads the fixture, intercepts the result-store request, and asserts the rendered native result. +3. Fix app-test regressions exposed by the browser test. +4. Start the TypeScript conversion with the new Playwright support layer and continue into the scanner path in later passes. 
+ +# Validation Gates + +- `pnpm test:browser test/playwright/apple-silicon-app-test.playwright.ts` +- `pnpm test:browser` +- Manual smoke check of `/apple-silicon-app-test/` if the browser test exposes timing or hydration issues + +# Deliverables + +- A Playwright browser test covering the app-test upload and scan flow +- Any app-test fixes required to make that test pass +- Initial TypeScript refactor scaffolding in the browser-test/scanner-adjacent path + +# Risks And Open Questions + +- The legacy scanner depends on zip and Mach-O parsing behavior in the browser, so fixture choice needs to stay minimal and deterministic. +- The repo still mixes `.js`, `.mjs`, `.ts`, `.vue`, and `.astro`, so conversion order matters; scanner-adjacent modules should move only after coverage exists. +- The worker-based scanner path likely needs separate follow-up coverage before it can replace the legacy path. diff --git a/docs/plans/cloudflare-dual-deploy-shadow.md b/docs/plans/cloudflare-dual-deploy-shadow.md new file mode 100644 index 0000000..4cbf9f2 --- /dev/null +++ b/docs/plans/cloudflare-dual-deploy-shadow.md @@ -0,0 +1,198 @@ +# Original Prompt + +> Okay, I've got, we do have environment variables locally, and we may have the same ones on Netlify. Let's go ahead and Build a plan to set up this dual deploy to Cloudflare to use the local environment variables and see how that works. Cloudflare, the desertarm.com is on Cloudflare, so we can just set up the subdomain and it should work. + +# Goal + +Set up a reversible dual-deploy path where the same repo and same commit can be deployed to both Netlify and Cloudflare, with Netlify remaining primary during the migration and Cloudflare serving a shadow subdomain for parity testing, environment validation, and incremental runtime cleanup. + +# Non-Goals + +- Cut production traffic over to Cloudflare in the first pass. +- Migrate all site data dependencies to D1 as part of the initial dual-deploy spike. 
+- Remove Netlify-specific build and deploy paths before Cloudflare parity exists. +- Introduce a separate runtime or package-manager change just because the deployment target changes. + +# Repo Findings + +- The app currently uses the Netlify adapter in `astro.config.mjs` via `@astrojs/netlify`. +- Astro only uses one adapter per build, so dual deploy will require two build configurations rather than one config that targets both platforms simultaneously. +- Astro CLI supports `--config`, which makes a dual-build setup practical without splitting the repo immediately. +- Current environment keys present in local `.env` are: +- `ALL_UPDATE_SUBSCRIBE` +- `BUILD_ID` +- `COMMITS_SOURCE` +- `GAMES_SOURCE` +- `GOOGLE_API_KEY` +- `HOMEBREW_SOURCE` +- `PUBLIC_API_DOMAIN` +- `PUBLIC_URL` +- `SCANS_SOURCE` +- `TEST_RESULT_STORE` +- `URL` +- `VFUNCTIONS_URL` +- `VIDEO_SOURCE` +- The current frontend/build path is still Netlify-shaped: +- `astro.config.mjs` uses `@astrojs/netlify` +- `netlify.toml` defines redirects and build behavior +- `helpers/astro/request.js` calls `helpers/config-node.js`, which reads `netlify.toml` from disk at runtime +- `package.json` scripts are still centered on `netlify-build` +- Existing workflows are split across deploy hooks for functions/frontend and a separate Cloudflare worker deploy lane. +- There is already a Cloudflare zone for `doesitarm.com`, so attaching a Worker or custom domain to a subdomain should be straightforward once the Worker build exists. + +# External Research + +- Astro’s deployment model is adapter-based. Official docs indicate one adapter per build, and the CLI supports `--config <path>`, which is the cleanest way to run a Netlify build and a Cloudflare build from the same repo. +- Cloudflare recommends `wrangler.jsonc` for new projects and supports named environments under `env.<name>`. 
+- Cloudflare local development can load secrets from either `.dev.vars` or `.env`, but Cloudflare explicitly says to choose one or the other rather than relying on both at once. +- Cloudflare supports environment-specific `.env.<environment>` files and merges `.env` files by precedence during local development. +- Cloudflare Workers can be attached to a subdomain either by a route or by `custom_domain`, and custom domains are the preferred fit when the Worker is the origin for that subdomain. +- Astro’s Cloudflare adapter exposes Cloudflare environment variables and bindings through the Worker runtime, and current docs also support importing environment bindings from `cloudflare:workers`. + +# Recommendation + +- Use a dual-build, dual-deploy model from the same commit: +- Netlify remains the primary production deploy target. +- Cloudflare gets a shadow deploy on a subdomain such as `cf-preview.doesitarm.com` or `edge-preview.doesitarm.com`. +- Create a dedicated Cloudflare Astro config rather than trying to make one config branch internally on host. +- Reuse the existing local root `.env` for the first local Cloudflare spike only if there is no `.dev.vars` in play, because Cloudflare supports `.env`-based local loading. +- Treat that reuse as transitional. The durable state should be: +- `.env.example` for committed placeholders +- `.env` or `.env.local` for current app-local development +- Cloudflare-specific `.env.cloudflare`, `.env.staging`, or `.dev.vars` strategy chosen explicitly +- Wrangler-managed secrets for deployed Cloudflare environments +- Keep the first Cloudflare deployment reading the same external sources that Netlify uses today. Do not bundle D1 into the first shadow deploy unless it is required for the Cloudflare app to boot. + +# Rollout Plan + +1. Define the dual-deploy shape and naming. +- Pick the shadow subdomain and document its purpose. +- Decide whether Cloudflare shadow traffic uses a route or `custom_domain`. 
+- Decide the first dual-deploy branch policy: +- deploy both on `master` +- or deploy Cloudflare only on a dedicated branch/tag until parity work stabilizes +- Define success criteria for “Cloudflare shadow is viable” before any production cutover discussion. + +2. Split Astro configuration into host-specific build targets. +- Keep the existing Netlify config as the current baseline. +- Add a Cloudflare-specific Astro config file that swaps `@astrojs/netlify` for `@astrojs/cloudflare`. +- Add build scripts that make the host target explicit, for example: +- `build:netlify` +- `build:cloudflare` +- `preview:cloudflare` +- Keep shared Vite/integration logic in shared modules so the configs differ only where platform behavior truly differs. + +3. Define the environment-variable contract for the Cloudflare shadow. +- Inventory each current env key by purpose: +- public site URL values +- external data-source URLs +- tokens/secrets +- build-only values +- runtime values +- Decide which values will live in Wrangler `vars` versus Wrangler secrets. +- For the first local spike, allow Wrangler to load from the existing root `.env` if the config stays at the repo root and there is no `.dev.vars`. +- After the spike, decide and document the long-term local convention: +- continue using `.env` for shared local values +- or move Cloudflare-specific sensitive values to `.dev.vars` +- Add a checked-in env template artifact so Cloudflare setup is reproducible without copying the real `.env`. +- Ensure no secret-bearing values are printed by CI or committed into Wrangler config. + +4. Make the runtime path adapter-neutral enough for shadow deploy. +- Remove runtime dependency on `netlify.toml` from the request path. +- Extract redirects into a platform-neutral source of truth that both Netlify and Cloudflare can consume. +- Audit any assumptions that rely on Netlify SSR behavior, filesystem layout, or deploy-hook conventions. 
+- Confirm whether `helpers/url.js` and public runtime config are still correct under the Cloudflare adapter. + +5. Add a local Cloudflare development lane. +- Add Wrangler config and local run instructions. +- Add a local dev command that exercises the Cloudflare target with the current env contract. +- Verify that the app boots locally on the Cloudflare target using local env values without editing secrets into source control. +- Add binding/type generation if the Cloudflare adapter path needs `wrangler types`. + +6. Add CI dual deploy without cutting over traffic. +- Keep existing Netlify deploy flow intact. +- Add a Cloudflare build-and-deploy workflow from the same commit SHA. +- Deploy the Cloudflare build to the shadow subdomain. +- Keep the Cloudflare workflow isolated from public PR execution until secrets and permissions are hardened. +- Make the deploy output report the Netlify URL, Cloudflare shadow URL, commit SHA, and environment used. + +7. Add parity checks for the shadow environment. +- Run smoke checks against both Netlify and Cloudflare deployments. +- Verify: +- homepage +- one dynamic app page +- one formula page +- app-test page +- redirects +- static search assets +- Compare response behavior, major console/runtime errors, and critical page content between the two hosts. +- Log differences in a durable artifact so the cleanup work is visible before cutover. + +8. Decide the cutover gate. +- Define the minimum parity bar: +- stable deploys +- no runtime-only Cloudflare failures +- environment values mapped cleanly +- redirects equivalent +- key pages verified +- Only after that, decide whether to: +- switch DNS/subdomain roles +- proxy production traffic through Cloudflare +- or continue Netlify as origin while more data-layer migration work lands + +# Validation Gates + +- Local Cloudflare target boots successfully with local env values and no committed secrets. +- Both host-specific Astro builds complete from the same commit. 
+- Cloudflare shadow subdomain serves the app over the Cloudflare zone. +- Redirect behavior is equivalent on a representative sample of known redirects. +- Core routes render on both hosts: +- `/` +- one `/app/...` +- one `/formula/...` +- `/apple-silicon-app-test/` +- CI output for Cloudflare deploy contains no secret material. +- Netlify remains healthy throughout the shadow rollout. + +# Deliverables + +- A focused dual-deploy plan in `docs/plans/cloudflare-dual-deploy-shadow.md` +- A target naming decision for the shadow subdomain +- Host-specific Astro build configs and scripts plan +- An explicit Cloudflare env/secrets mapping plan based on current local keys +- A runtime-neutralization checklist for Netlify-specific request/config logic +- A parity verification checklist for Netlify vs Cloudflare + +# Risks And Open Questions + +- The current runtime read of `netlify.toml` is the biggest likely blocker for “works the same on both hosts.” +- Reusing the current root `.env` for the first Cloudflare spike is practical, but it may blur long-term ownership unless a dedicated Cloudflare local env convention is chosen quickly. +- Some current env keys may be build-time only or tied to Netlify/external APIs; they should not all be assumed to map 1:1 to Cloudflare runtime secrets. +- If the Cloudflare shadow deploy uses the same backend sources as Netlify, host parity is easier but data-layer migration remains deferred. +- If the Cloudflare deploy is allowed to drift from the Netlify build graph, the comparison loses value. +- The existing Cloudflare worker workflow currently prints secret-derived files to logs; this must be treated as a blocker to any secret-bearing Cloudflare app deploy workflow. 
+ +# Sources + +- `astro.config.mjs` +- `package.json` +- `.github/workflows/deploy-cloudflare-workers.yml` +- `.github/workflows/deploy-frontend.yaml` +- `.github/workflows/deploy-functions.yaml` +- `netlify.toml` +- `helpers/astro/request.js` +- `helpers/config-node.js` +- Cloudflare Wrangler configuration docs: + https://developers.cloudflare.com/workers/wrangler/configuration/ +- Cloudflare environments docs: + https://developers.cloudflare.com/workers/wrangler/environments/ +- Cloudflare local environment variables docs: + https://developers.cloudflare.com/workers/configuration/environment-variables/ +- Cloudflare custom domains docs: + https://developers.cloudflare.com/workers/configuration/routing/custom-domains +- Astro CLI docs: + https://docs.astro.build/en/reference/cli-reference/ +- Astro Cloudflare adapter docs: + https://docs.astro.build/en/guides/integrations-guide/cloudflare/ +- Astro Netlify adapter docs: + https://docs.astro.build/en/guides/integrations-guide/netlify/ diff --git a/docs/plans/public-discoverability-and-dataset-plan.md b/docs/plans/public-discoverability-and-dataset-plan.md new file mode 100644 index 0000000..1dd72ad --- /dev/null +++ b/docs/plans/public-discoverability-and-dataset-plan.md @@ -0,0 +1,329 @@ +# Original Prompt + +> Audit this app for recommend changes from this research ranked by highest leverage. Score each for ease of implementation. +> +> Okay, let's put all of these into a plan. Let's put the easiest to implement towards the top, but also let's include in the plan at every step to research what we're about to do and look for anything helpful, anything that would be helpful for the implementation. + +# Goal + +Turn `doesitarm` into a stronger public compatibility knowledge product by +improving its discoverability, machine-readable surfaces, and evidence quality, +while keeping the rollout ordered by easiest implementation first and requiring +a fresh research pass before each implementation stage. 
+ +# Non-Goals + +- Rebuild the entire site or search experience in the first pass. +- Move the full discovery/ingestion pipeline into D1 as part of this plan. +- Publish raw crawls, downloaded binaries, or private operational data. +- Treat `llms.txt` or any single AI-specific surface as the primary strategy. +- Optimize for every possible search feature before the canonical app pages are + stronger for humans. + +# Repo Findings + +- App, formula, and game detail pages all flow through the same basic listing + template: + [src/pages/app/[...appPath].astro](/src/pages/app/[...appPath].astro), + [src/pages/formula/[...formulaPath].astro](/src/pages/formula/[...formulaPath].astro), + [src/pages/game/[...gamePath].astro](/src/pages/game/[...gamePath].astro), + [src/components/default-listing.astro](/src/components/default-listing.astro). +- Those pages are still closer to status/listing pages than evidence pages. + The main body currently emphasizes the status line, related links, device + support, videos, bundles, and a generic last-updated footer, but not a clear + "how we know" or methodology layer. +- The head system is centralized in + [helpers/config-node.js](/helpers/config-node.js), + but it does not currently emit canonical links and uses generic descriptions + for listing pages. +- The listing head model is centralized in + [helpers/listing-page.js](/helpers/listing-page.js), + but the current title/description logic is generic and partially hardcoded. +- Structured data support exists, but it is effectively video-only: + [helpers/listing-page.js](/helpers/listing-page.js) + builds JSON-LD for videos and benchmark pages, and + [helpers/config-node.js](/helpers/config-node.js) + only emits JSON-LD for `tv` and `benchmarks` routes.
+- The app pages already have a useful public JSON surface under `static/api/`, + but there is no explicit dataset landing page or formal public snapshot + surface documented for humans and machines. +- The build already generates sitemaps and a search index: + [build-lists.js](/build-lists.js). +- `robots.txt` is currently a blanket allow with only a sitemap declaration: + [static/robots.txt](/static/robots.txt). +- I did not find a checked-in `llms.txt`, `llms-full.txt`, or public plain-text + / Markdown export surface. + +# External Research + +- Use + [docs/research/desktop-app-compatibility-data-strategy-2026-04-04.md](/docs/research/desktop-app-compatibility-data-strategy-2026-04-04.md) + as the strategy anchor for this plan. +- Google’s current guidance says AI search eligibility still largely depends on + normal Search fundamentals, which means the public app pages and metadata + still matter more than any AI-only affordance. +- Google’s helpful-content, spam-policy, and reviews guidance implies that + thin template pages are risky, while evidence-backed, comparison-friendly, and + user-helpful pages are more durable. +- OpenAI and Anthropic now expose separate search, training, and user-triggered + crawlers, which makes bot-specific crawl policy a practical implementation + surface. +- `llms.txt` and AI-friendly Markdown exports appear useful as secondary + surfaces, but not as the primary lever. +- Dataset markup is useful only when the site actually publishes a real dataset + landing page and download surface. + +# Decision + +Execute this work in ease-first order, not leverage-first order. + +Rationale: + +- The highest-leverage item is the evidence-page upgrade, but it is not the + easiest thing to implement cleanly. +- A sequence of smaller early wins can improve crawl quality, metadata, and + machine-readable surfaces before the larger content-model work lands.
+- Each stage should begin with a small research/inspection pass so the + implementation reflects current docs, current schema requirements, and any + repo changes that happened after this plan was written. + +# Rollout Plan + +## 1. Strengthen head metadata and canonical URL support + +- Research and confirm: + review current Astro/head implementation, Google guidance for canonical URLs, + and whether any existing route types need exceptions before editing the head + system. +- Implement: + add canonical links, ensure `og:url` / social URL values use the canonical + page URL, and upgrade page title/description generation so app pages include + more useful fact signals than the current generic copy. +- Scope: + [helpers/config-node.js](/helpers/config-node.js), + [helpers/listing-page.js](/helpers/listing-page.js), + and the page entrypoints that feed `headOptions`. +- Stop condition: + the site emits canonical links and noticeably better per-page metadata without + changing page structure yet. + +## 2. Add a public methodology layer and improve source labeling on app pages + +- Research and confirm: + review Google’s helpful-content and review-style guidance again, then inspect + the current app page layout to identify the smallest meaningful “how we know” + addition that improves user trust without requiring a full template rewrite. +- Implement: + add a methodology page or methodology section, link to it from app pages, and + stop hiding source context behind generic labels such as `View`. +- Implement: + expose clearer source/evidence language on app pages, especially around: + official source, verification source, last verified, and update/report flows.
+- Scope: + [src/components/default-listing.astro](/src/components/default-listing.astro), + [src/components/listing-parts/related-links.astro](/src/components/listing-parts/related-links.astro), + [src/components/listing-parts/last-updated.astro](/src/components/listing-parts/last-updated.astro), + plus a new methodology page in `src/pages/`. +- Stop condition: + every app page has a clearer “how we know” path even before the full + evidence-page upgrade lands. + +## 3. Add a bot-policy matrix and update crawl controls + +- Research and confirm: + re-check current OpenAI and Anthropic crawler docs plus any search-engine bot + requirements immediately before changing crawler policy. +- Implement: + define the intended policy for: + `Googlebot`, `OAI-SearchBot`, `GPTBot`, `ChatGPT-User`, + `Claude-SearchBot`, `ClaudeBot`, and `Claude-User`. +- Implement: + update `robots.txt` accordingly and add a short repo-local note documenting + the reasoning so the bot policy is not rediscovered from scratch later. +- Scope: + [static/robots.txt](/static/robots.txt) and a + small supporting doc under `docs/` or `docs/plans/`. +- Stop condition: + the site has explicit AI/search crawler policy rather than a wildcard default. + +## 4. Add `llms.txt` and a lightweight plain-text export surface + +- Research and confirm: + review the latest `llms.txt` / `llms-full.txt` conventions and current + examples before implementing anything. Re-check whether there are now better + conventions for docs maps, plain-text exports, or per-page Markdown. +- Implement: + add a minimal `llms.txt` that points to the most useful public surfaces: + homepage, categories, methodology, dataset/download page, and canonical app + surfaces.
+- Implement: + decide the smallest useful plain-text export surface: + either a global `llms-full.txt`, a compact machine-readable summary, or a + small number of high-value Markdown/text views. +- Scope: + `static/` and whichever build helpers are needed to produce a stable + low-cost text surface. +- Stop condition: + the project exposes a minimal AI-friendly discovery surface without yet + attempting a full text-export architecture. + +## 5. Expand structured data beyond videos + +- Research and confirm: + re-read current Google structured-data docs and Schema.org guidance before + implementation to avoid stale or invalid markup assumptions. +- Implement: + remove the current “video-only emit” restriction from the head system where + appropriate and add truthful app-level structured data for app, formula, and + game pages. +- Implement: + keep markup aligned with visible page content. Do not invent commerce fields + or unsupported review fields just to chase rich results. +- Scope: + [helpers/config-node.js](/helpers/config-node.js), + [helpers/listing-page.js](/helpers/listing-page.js), + [helpers/structured-data.js](/helpers/structured-data.js), + plus tests around structured-data output. +- Stop condition: + app-like pages emit valid, truthful JSON-LD, while video pages keep their + existing structured-data support. + +## 6. Publish a dataset/download surface + +- Research and confirm: + re-check Google Dataset guidance, `Dataset` / `DataDownload` expectations, + and practical licensing/provenance requirements before publishing anything as + a dataset. +- Implement: + add a dataset landing page that explains what can be downloaded, how often it + updates, what fields are included, and how the data should be cited or used.
+- Implement: + promote one or more existing JSON outputs into formal public snapshot assets, + and decide whether CSV should also be emitted in the first pass. +- Scope: + [build-lists.js](/build-lists.js), new pages in + `src/pages/`, and any helper code needed to expose a stable snapshot surface. +- Stop condition: + the site offers a real public dataset entrypoint instead of only implicit API + files. + +## 7. Upgrade canonical app pages from status pages to evidence pages + +- Research and confirm: + inspect the live app-page composition, compare it with the research memo’s + recommendations, and review any new examples of strong evidence-heavy product + or compatibility pages before rewriting the template. +- Implement: + extend app pages so they include a clearer evidence summary, environment + distinctions, verification method, source attribution, and a more explicit + “what changed” or “status history” path where data exists. +- Implement: + define the smallest new content blocks that materially increase usefulness: + status summary, evidence summary, environment/workaround summary, and change + notes. +- Scope: + [src/components/default-listing.astro](/src/components/default-listing.astro) + plus new listing-part components and any supporting listing helpers. +- Stop condition: + the canonical app pages answer more of the real user question than the + current template does. + +## 8. Reduce index bloat and make indexing intentional + +- Research and confirm: + re-read Google crawling/indexing guidance before touching indexability rules, + especially around internal search, faceted navigation, and low-value pages. +- Implement: + decide which surfaces should remain indexable by default: + canonical app pages, category pages, dataset pages, and high-intent + comparison pages.
+- Implement: + identify low-value or confusing surfaces that may need `noindex`, canonical + consolidation, or sitemap exclusion. +- Scope: + [build-lists.js](/build-lists.js), + [helpers/config-node.js](/helpers/config-node.js), + search/category pages, and any sitemap/index-generation helpers. +- Stop condition: + the site’s crawl/index policy reflects page value instead of “everything is + indexable because it exists.” + +# Validation Gates + +- Metadata/canonical stage: + inspect page source for `/`, one `/app/...`, one `/formula/...`, one + `/game/...`, and one `/tv/...` page and confirm canonical plus improved title + and description output. +- Methodology/source-label stage: + manually review one app page and confirm that evidence/source links are more + legible and that a methodology path is present. +- Bot-policy stage: + validate generated `robots.txt` content against the intended bot matrix and + confirm the file still advertises the sitemap. +- `llms.txt` stage: + fetch and inspect the generated files and confirm they point to live public + surfaces that are actually useful. +- Structured-data stage: + extend or unskip structured-data tests and validate one app page plus one + video page. +- Dataset stage: + confirm the landing page exists, download links work, and the documented + schema matches the shipped snapshot fields. +- Evidence-page stage: + manually review several representative pages: + native app, non-native app, formula, and game. +- Index-bloat stage: + inspect page source, sitemap output, and any `noindex` directives to confirm + only intended pages are affected. +- End-to-end gate: + `pnpm build` + `pnpm test` + plus manual review of rendered HTML on representative routes.
+ +# Deliverables + +- A repo-local implementation plan in + [public-discoverability-and-dataset-plan.md](/docs/plans/public-discoverability-and-dataset-plan.md) +- Improved page metadata and canonical handling +- A methodology surface linked from public pages +- An explicit crawler-policy matrix and updated `robots.txt` +- `llms.txt` and a lightweight AI-friendly text surface +- App-level structured data beyond videos +- A dataset/download landing page and public snapshot surface +- Stronger evidence-oriented app pages +- A more intentional indexing strategy + +# Risks And Open Questions + +- Ease-first order is not identical to leverage-first order. The plan delays the + largest content-model improvement so smaller foundation changes can land + first. +- The current public JSON shape may not be sufficient for a strong evidence-page + upgrade without adding new derived fields or exposing more of the existing + data model. +- Bot-policy decisions are partly product decisions, not only technical ones. + The safest implementation is not necessarily the best business decision. +- `llms.txt` conventions are still moving, so the implementation should remain + lightweight and easy to change. +- Dataset publication raises questions about license, attribution, rate limits, + and whether CSV/Parquet should be included now or later. +- Structured data can easily drift away from visible page truth if the page + templates and data model are not upgraded together. +- Search/category pages may currently be valuable enough to keep indexed; the + “index bloat” stage needs traffic and usefulness review before it removes + anything from search.
+ +# Sources + +- [docs/research/desktop-app-compatibility-data-strategy-2026-04-04.md](/docs/research/desktop-app-compatibility-data-strategy-2026-04-04.md) +- [src/pages/app/[...appPath].astro](/src/pages/app/[...appPath].astro) +- [src/pages/formula/[...formulaPath].astro](/src/pages/formula/[...formulaPath].astro) +- [src/pages/game/[...gamePath].astro](/src/pages/game/[...gamePath].astro) +- [src/components/default-listing.astro](/src/components/default-listing.astro) +- [src/components/listing-parts/related-links.astro](/src/components/listing-parts/related-links.astro) +- [src/components/listing-parts/last-updated.astro](/src/components/listing-parts/last-updated.astro) +- [helpers/config-node.js](/helpers/config-node.js) +- [helpers/listing-page.js](/helpers/listing-page.js) +- [helpers/structured-data.js](/helpers/structured-data.js) +- [build-lists.js](/build-lists.js) +- [static/robots.txt](/static/robots.txt) diff --git a/docs/plans/repo-typescript-migration.md b/docs/plans/repo-typescript-migration.md new file mode 100644 index 0000000..cf9c1a6 --- /dev/null +++ b/docs/plans/repo-typescript-migration.md @@ -0,0 +1,151 @@ +# Original Prompt + +> Let's fix the cloudflare. +> +> I also want to completely refactor this repo to TypeScript without breaking CI. +> +> Do we have a full e2e test for App Scanning? + +# Goal + +Move the repo from a mostly JavaScript codebase to a TypeScript-first codebase without breaking CI, production deploys, or the existing app-scanning behavior, while keeping each migration slice small enough to review and roll back. + +# Non-Goals + +- Rewrite the whole repo to TypeScript in one commit.
+- Change scanner or site behavior just to satisfy typing. +- Replace Astro, Vue, pnpm, or Netlify/Cloudflare as part of the migration itself. +- Force every legacy worker/helper package onto modern tooling before the type migration proves out. + +# Repo Findings + +- The repo is still overwhelmingly JavaScript-led: +- 195 code files across `.ts`, `.js`, `.mjs`, `.vue`, and `.astro` +- 12 TypeScript files, about 6.2% of code files +- 1,802 TypeScript lines out of 20,721 total code lines, about 8.7% by line count +- The scanner/app-test surface is now the strongest typed foothold in the repo: +- `helpers/scanner/client.ts` +- `helpers/scanner/scan.ts` +- `helpers/scanner/worker.ts` +- `test/scanner/client.test.ts` +- The browser harness is already typed and now protects the main app-scanning flow: +- `test/playwright/apple-silicon-app-test.playwright.ts` +- `test/playwright/pagefind-native-filter.playwright.ts` +- App-scanning coverage is strong for the happy path, but not exhaustive: +- There is a real browser upload E2E for both the legacy scanner and the worker scanner via `?version=2`. +- That test stubs the result-store POST, so it is not a full database/backend integration test. +- There is a direct worker-scanner test using a generated `.app.zip` fixture. +- Negative fixtures and heavier archive cases like DMG/PKG are still missing. +- The page layer still uses JavaScript/Vue options API in `pages/apple-silicon-app-test.vue`, even though the scanner internals beneath it are now typed. +- The repo has mixed runtime shapes: +- old `.js` helper modules +- some `.mjs` server/build modules +- `.vue` and `.astro` files with little or no embedded TypeScript +- Cloudflare worker subprojects under `doesitarm-default/` and `workers/analytics/` still use old JavaScript toolchains and should be treated as separate migration surfaces. + +# Decision + +Do the migration as a staged refactor with explicit CI gates after each slice. 
Keep the scanner/app-test lane as the proving ground, then expand outward to helper/build modules, then UI/runtime surfaces, then worker subprojects. Prefer converting boundary-stable modules first and avoid cross-repo churn until tests and deploys stay green. + +# Rollout Plan + +1. Lock the migration baseline and CI contract. +- Treat `pnpm run typecheck`, `pnpm run test`, and `pnpm run test:browser` as the minimum green gate for repo-level migration slices. +- Keep production smoke checks for app scanning and Pagefind in the validation ladder when changes touch scanner, app-test, or search. +- Do not merge large TypeScript batches without passing the same gates that currently protect production deploys. + +2. Finish the scanner-adjacent TypeScript lane before broadening scope. +- Convert the remaining scanner internals and helper modules that sit directly below the typed worker surface: +- `helpers/scanner/parsers/macho.js` +- `helpers/scanner/parsers/plist.js` +- `helpers/scanner/file-api.js` +- Add negative scanner fixtures alongside the existing happy-path test: +- decompression failure +- missing `Info.plist` +- missing Mach-O +- non-native architecture result +- Keep the app-test browser E2E green for both the default path and `?version=2`. + +3. Make the worker scanner the default path, then delete the legacy scanner. +- Switch the app-test page to use the typed worker scanner path by default. +- Keep the browser test protecting the route while that cutover happens. +- Once the default path is stable, remove the old `helpers/app-files-scanner.js` legacy implementation rather than maintaining two divergent scanner stacks. +- This is the biggest simplification available before broader TypeScript conversion. + +4. Convert runtime config, URL, and shared helper modules. 
+- Migrate shared helper modules that are reused by pages and builds but have relatively stable behavior: +- `helpers/public-runtime-config.mjs` +- `helpers/url.js` +- `helpers/check-types.js` +- `helpers/environment.js` +- `helpers/config-node.js` +- Prefer module-by-module conversion with targeted regression coverage instead of umbrella “helpers” commits. + +5. Convert build and list-generation modules in bounded slices. +- Move the build pipeline from mixed `.js`/`.mjs` to typed modules in sequence: +- list builders (`helpers/build-*.js`) +- API/static builders +- scripts under `scripts/` +- `build-lists.js` +- Keep `pnpm netlify-build` as the main verification gate for this stage, because these modules affect the deploy artifact more than the browser UI. + +6. Convert Vue and Astro surfaces after their underlying helpers are typed. +- Start with high-risk pages that already have browser tests: +- `pages/apple-silicon-app-test.vue` +- search surfaces touched by the Pagefind regression +- Then move into other Vue components and Astro pages incrementally. +- Avoid converting large groups of Vue components in one pass unless they share the same typed props/state model. + +7. Migrate the Cloudflare worker subprojects as a separate workstream. +- Treat `doesitarm-default/` and `workers/analytics/` as isolated packages with their own runtime/toolchain constraints. +- Convert them after the main site/scanner path is stable. +- Keep their CI/deploy workflow green independently of the main Astro site migration. 
+ +# Validation Gates + +- For scanner/app-test changes: +- `pnpm run typecheck` +- `pnpm exec vitest run test/scanner/client.test.ts` +- `pnpm run test:browser` +- production smoke on `https://doesitarm.com/apple-silicon-app-test/` + +- For helper/build changes: +- `pnpm run typecheck` +- `pnpm run test` +- `pnpm netlify-build` + +- For search-related changes: +- `pnpm run typecheck` +- `PLAYWRIGHT_BASE_URL=https://doesitarm.com pnpm run test:browser:pagefind` + +- For worker-subproject changes: +- the relevant worker install/build path still succeeds +- the Cloudflare deploy workflow remains green + +# Deliverables + +- A staged TypeScript migration plan in `docs/plans/repo-typescript-migration.md` +- A bounded migration order that preserves CI and production safety +- A rule for when it is safe to remove the legacy scanner path +- Expanded scanner fixture coverage beyond the current native happy path +- A clear split between main-site migration work and Cloudflare worker migration work + +# Risks And Open Questions + +- The fastest way to “convert everything” would create too much churn and likely destabilize CI; the staged path is slower but safer. +- Browser-safe archive support may continue to diverge from Node/Bun-safe archive support for formats like DMG and PKG. +- Legacy helper modules may hide runtime assumptions that only show up in the full Netlify build path. +- Vue options-API files can become noisy when typed mechanically; they should be migrated only after props/state boundaries are clear. +- The Cloudflare subprojects use older JS tooling and may need their own migration plan if they resist the main repo conventions. +- “100% TypeScript” may not be worth pursuing literally for generated glue or tiny legacy worker entrypoints if the last few files cost more risk than value. 
+ +# Sources + +- `docs/plans/app-test-typescript-refactor.md` +- `docs/plans/app-discovery-d1-automation.md` +- `test/playwright/apple-silicon-app-test.playwright.ts` +- `test/scanner/client.test.ts` +- `helpers/scanner/client.ts` +- `helpers/scanner/scan.ts` +- `helpers/scanner/worker.ts` +- `pages/apple-silicon-app-test.vue` diff --git a/docs/research/desktop-app-compatibility-data-strategy-2026-04-04.md b/docs/research/desktop-app-compatibility-data-strategy-2026-04-04.md new file mode 100644 index 0000000..0084a7e --- /dev/null +++ b/docs/research/desktop-app-compatibility-data-strategy-2026-04-04.md @@ -0,0 +1,600 @@ +# Desktop App Compatibility Data Strategy For doesitarm + +Tease: In 2026, the winning play is not "AI SEO." It is publishing a high-trust, machine-readable compatibility corpus that is genuinely useful on the open web and selectively keeping the operational and proprietary layers private. + +Lede: For `doesitarm` on 2026-04-04, the best-fit strategy is to treat the site as an entity-and-evidence graph for desktop software compatibility: publish canonical app pages, provenance-rich evidence, structured exports, and selective machine-readable surfaces for discovery; keep raw crawls, binary artifacts, candidate matches, scoring logic, and operational intelligence private. + +Why it matters: +- This project can outlive the Apple Silicon transition if the core model is "desktop software compatibility knowledge," not just "Apple Silicon list posts." +- Google's 2025 AI search guidance still rewards the same fundamentals: unique content, crawlable pages, textual clarity, and trustworthy evidence, not special AI-only tricks. +- OpenAI and Anthropic now expose separate search, user-action, and training bots, which means "open versus closed" is no longer binary. You can choose visibility, training access, and operational exposure separately. 
+ +Go deeper: +- Think of the public site as a citation layer and decision-support layer, not as the full warehouse. +- Publish public facts, provenance, timestamps, and curated exports. Keep raw ingestion, low-confidence candidates, and monetizable workflow intelligence private. +- Treat `llms.txt` and Markdown exports as helpful secondary surfaces, not as the core strategy. The core strategy is still clean HTML, canonical URLs, structured data, sitemaps, and useful pages. + +Date: 2026-04-04 + +## Scope + +Research how to think about a long-lived desktop app compatibility database as a +content, SEO, and AI-discoverability system in 2026, including: + +- best practices for public content architecture +- how LLM-driven discovery changes the picture +- what data should likely stay public versus private +- what audiences this data can serve +- tradeoffs between more-open and more-closed approaches + +## Short Answer + +Build `doesitarm` as a public knowledge product with a private operating system +underneath it. + +Publicly, publish: + +- canonical app pages +- compatibility status by platform/environment +- evidence summaries and source links +- timestamps, changelogs, and history +- stable IDs, taxonomy, and machine-readable metadata +- a limited public API or snapshot exports for high-value reuse + +Privately, keep: + +- raw crawls and downloaded binaries +- candidate entities before review +- normalization, dedupe, and confidence logic +- crawler logs, abuse rules, and infrastructure controls +- enrichment that creates monetizable leverage rather than user value on the open web + +The biggest strategic shift from 2018 to 2026 is this: + +1. Search still rewards useful original pages. +2. AI discovery mostly rides on those same pages. +3. Separate crawler controls now let you be open for search while staying more closed for training. +4. 
The moat is less "having any compatibility data at all" and more: + verification quality, provenance, freshness, historical depth, and workflow speed. + +Inference: +No single source states that exact four-part conclusion. It is the synthesis that +best fits the repo state plus current Google, OpenAI, Anthropic, Cloudflare, +HN, and Lobsters evidence. + +## What The Repo Already Knows + +- The project already acts like a compatibility corpus, not just a blog: + [README.md](/README.md) is a manually curated, + source-linked compatibility list. +- The repo already has a plan to move toward a canonical database and discovery + pipeline: + [docs/plans/app-discovery-d1-automation.md](/docs/plans/app-discovery-d1-automation.md). +- The public site already exposes crawlable pages, a sitemap, and permissive + crawling: + [static/robots.txt](/static/robots.txt), + [static/sitemap-index.xml](/static/sitemap-index.xml). +- The current public JSON already exposes useful app-level fields such as name, + aliases, status, bundle IDs, related links, scan details, and device support: + [static/api/app/spotify.json](/static/api/app/spotify.json). +- The current structured data implementation is narrow and video-centric: + [helpers/structured-data.js](/helpers/structured-data.js), + [helpers/listing-page.js](/helpers/listing-page.js), + [helpers/config-node.js](/helpers/config-node.js). +- I did not find a checked-in `llms.txt`, `llms-full.txt`, or per-page Markdown + export surface. +- I also did not find `SoftwareApplication` or `Dataset` structured data on app + or dataset pages. + +Inference: +`doesitarm` already has enough public data shape to become a strong +machine-readable corpus. The main gap is not "inventing the dataset."
The gap is +formalizing and publishing the right layers of it. + +## What The Evidence Says + +### 1. Google AI search still wants normal SEO fundamentals, not special AI tricks + +Google's current AI-features guidance says there are no extra technical +requirements for AI Overviews or AI Mode beyond normal Search eligibility. +Google explicitly says you do not need new AI files or special schema just to +appear in AI features. + +What does matter: + +- crawl access +- internal links +- page experience +- important content in textual form +- structured data matching visible text +- unique, non-commodity content + +This is the strongest argument against building an "AI discoverability" strategy +around gimmicks alone. + +### 2. Large-scale thin template pages are a real risk + +Google's helpful-content and spam-policy guidance is directly relevant to +programmatic compatibility sites: + +- people-first content is favored +- pages made mainly to attract search visits are a warning sign +- scaled content abuse includes generating many low-value pages, including from + feeds or automated transformations + +That means a compatibility database can absolutely win in search, but only if +its pages add decision-making value. Thin pages that just restate a status field +are dangerous. + +### 3. Compatibility content should look more like tested reviews than like directory filler + +Google's reviews guidance is a good proxy for compatibility pages because users +often arrive with a purchase, migration, or workflow decision in mind. 
+ +The guidance consistently rewards: + +- original research +- first-hand evidence +- quantitative measurements where relevant +- comparisons +- what changed across versions +- benefits and drawbacks + +For `doesitarm`, that maps cleanly onto: + +- status by environment +- last verified date +- evidence links +- scanner output or screenshots where appropriate +- "what changed" changelog notes +- comparison pages like native vs Rosetta vs virtualization vs cloud workaround + +### 4. Dataset markup is useful, but it should describe real dataset landing pages + +Google's dataset documentation recommends canonical landing pages plus dataset +metadata such as `sameAs`, `isBasedOn`, identifiers, license, and download +distribution metadata. + +That is a strong fit for curated exports such as: + +- a public daily or weekly compatibility snapshot +- a historical archive by date +- vendor- or category-specific exports +- a Windows-on-ARM or future-transition slice later on + +Important nuance: +Google's dataset docs are about Dataset Search discovery, not a substitute for +general web SEO. Dataset markup helps when you actually publish datasets. + +### 5. `SoftwareApplication` markup fits the entity model, but Google rich-result requirements are narrower + +Schema.org's `SoftwareApplication` type supports fields that are very relevant +here, including: + +- `applicationCategory` +- `downloadUrl` +- `featureList` +- `operatingSystem` +- `softwareRequirements` +- `softwareVersion` +- `supportingData` + +Google also has a software-app structured-data feature, but its rich-result +requirements are more commerce-shaped, including `offers.price` and +review/rating support. That means: + +- use `SoftwareApplication` semantics where they match the visible page truth +- do not invent store-like fields just to chase rich results +- use dataset markup for exports and software/entity markup for canonical app + pages + +### 6. 
AI discoverability is now bot-by-bot, not one global yes/no + +OpenAI and Anthropic both now distinguish between different AI access modes. + +OpenAI: + +- `OAI-SearchBot` is for search inclusion +- `GPTBot` is for training +- `ChatGPT-User` is for user-triggered actions + +Anthropic: + +- `ClaudeBot` is for training +- `Claude-SearchBot` is for search quality +- `Claude-User` is for user-triggered retrieval + +This is strategically important. You no longer need to choose only between: + +- fully public for every AI purpose +- fully blocked for every AI purpose + +You can allow discovery while disallowing training, or allow search while +tightly managing user-action access, depending on your goals. + +### 7. `llms.txt` is real, but it is still a secondary signal + +Cloudflare has implemented `llms.txt`, `llms-full.txt`, and per-page Markdown +exports, and Simon Willison has highlighted similar docs-map patterns as useful +for agent tooling. + +That said: + +- Google explicitly says no special AI text files are required for AI features +- OpenAI's discoverability guidance focuses on crawler access, `noindex`, and + citation/linking, not `llms.txt` +- HN and Lobsters discussions show real skepticism around AI crawler incentives + and how consistently emerging conventions are respected + +Best interpretation: + +- `llms.txt` is worth adding because it is cheap and increasingly recognized +- it should not be treated as the core lever +- the core lever is still strong public pages plus clean machine-readable + content + +### 8. 
AI-friendly plain-text and Markdown surfaces do have practical value + +Cloudflare's docs work here is the clearest practical example: + +- per-page Markdown versions +- an index file +- bulk text export +- semantic HTML +- `noindex` on low-value or confusing pages + +This is less about search ranking and more about: + +- making retrieval cheaper and more accurate for agents +- improving citation quality +- reducing token waste +- giving your own future agents and partners a stable ingest format + +For a compatibility corpus, that suggests public Markdown or JSON exports are +worth doing for the canonical facts layer. + +### 9. Freshness and URL discovery matter more as the corpus grows + +Google recommends sitemaps and Search Console monitoring. +IndexNow gives faster change pings for engines that support it, including Bing. + +For a frequently updated compatibility corpus, this argues for: + +- canonical landing pages +- clean sitemap generation +- changelog feeds or update streams +- optional IndexNow support for faster non-Google discovery + +### 10. The crawl environment is getting more adversarial + +Cloudflare Radar reported AI and search crawling growth of 18% from May 2024 to +May 2025 across its measured cohort, with `GPTBot` up 305%. +HN and Lobsters operator discussions show why this matters in practice: + +- some AI crawlers create real infrastructure cost +- incentives are less aligned than classic web search +- operators increasingly need bot-specific controls, rate limiting, and + selective exposure + +This is the best evidence for keeping raw and high-cost surfaces private even if +you lean more open on the public facts layer. + +## Ways This Data Can Create Value + +### Human audiences + +- End users deciding whether they can keep using a favorite app on new hardware. +- IT, procurement, and upgrade planners deciding when a transition is safe. +- Developers and vendors tracking native support gaps and competitive pressure. 
+- Journalists and analysts covering platform transitions. +- Researchers and historians studying how ecosystems adapt to hardware changes. + +### Machine audiences + +- Search engines indexing canonical app, category, and comparison pages. +- LLM search products citing your pages as evidence. +- RAG systems consuming public snapshots or APIs. +- Agents answering migration, procurement, or troubleshooting questions. +- Internal `doesitarm` automation using the same canonical public layer as a + stable reference surface. + +### Business-model value + +- Audience growth from high-intent compatibility queries. +- Affiliate or sponsored monetization on truly decision-support pages. +- Paid APIs, bulk exports, or enterprise dashboards. +- Vendor intelligence and alerting. +- Historical transition data as a differentiated research asset. + +Inference: +The public facts are likely to commoditize over time. The durable value is the +combination of breadth, freshness, provenance, history, and tooling layered on +top of those facts. 
+ +## What Should Likely Stay Public + +Public-by-default fields: + +- stable app identifier and canonical URL +- app name, aliases, vendor, category, platform family +- compatibility status by environment +- environment dimensions such as CPU architecture, OS family/version, native + vs translation vs virtualization +- bundle IDs and installer/package metadata where safe and user-useful +- last verified date, first seen date, last changed date +- public evidence summary and source links +- changelog summary for status changes +- category and comparison pages built from real user tasks +- curated JSON, CSV, or Parquet snapshot exports +- public structured data and sitemaps + +Public page types that seem high-value: + +- canonical app pages +- category pages +- "best alternatives if not native yet" pages +- transition pages such as "best native DAWs on Apple Silicon" +- comparison pages by use case, hardware generation, and workaround path +- dataset landing pages for bulk exports + +## What Should Likely Stay Private + +Private-by-default fields: + +- raw crawled HTML and downloaded ZIP/DMG/PKG artifacts +- extracted binaries and quarantine samples +- low-confidence matches and candidate entities +- dedupe, normalization, and scoring heuristics +- reviewer notes, moderation notes, and dispute state +- crawler logs, IP intelligence, WAF rules, and abuse signatures +- affiliate economics, contact records, outreach state, and deal terms +- internal confidence models, embeddings, and experimental feature engineering +- unpublished source mappings and scrape recipes that are costly to build + +Why keep these private: + +- operational risk +- legal and hosting risk +- abuse resistance +- clearer moat +- lower copyability + +## Different Ways To Think About The Database + +### 1. 
Directory / programmatic SEO system + +Upside: +- fastest traffic growth if executed well + +Downside: +- easiest to drift into thin pages and scaled-content abuse +- weakest long-term moat + +Use this frame only if every template answers a real question better than a +generic directory would. + +### 2. Public knowledge graph with evidence + +Upside: +- strongest fit for search, citations, and trust +- best long-term reuse across Apple, Windows, and future transitions + +Downside: +- requires stronger data modeling and provenance discipline + +This is the best framing for `doesitarm`. + +### 3. Public publication layer over a private intelligence system + +Upside: +- best balance of discoverability and defensibility +- easiest path to enterprise/API products later + +Downside: +- more operational complexity + +This is the recommended operating model. + +### 4. Mostly closed database with selective public summaries + +Upside: +- strongest direct control over assets + +Downside: +- weakest SEO and AI discoverability +- hardest to build brand authority from the data itself + +This makes sense only if monetization depends more on closed workflows than on +being the public authority. + +## Open Vs Closed Strategy Options + +### Option 1. Open facts, private operations + +Publish: + +- canonical pages +- evidence summaries +- limited exports +- structured data + +Keep private: + +- raw ingestion +- candidate pipeline +- scoring and ops + +Tradeoff: +Best overall balance of discoverability, trust, and defensibility. + +### Option 2. Open pages, paid API / paid bulk data + +Publish: + +- strong pages for discovery and citations +- free lightweight API or delayed snapshots + +Charge for: + +- real-time API +- higher limits +- historical depth +- enterprise filters and alerts + +Tradeoff: +Strong monetization path, but requires clearer product packaging. + +### Option 3.
Fully open data commons + +Publish: + +- everything except unsafe raw binaries/secrets + +Tradeoff: +Maximum goodwill, citation, and reuse. +Minimum moat unless monetization shifts to services, sponsorship, or community +leadership. + +### Option 4. Selective access / crawler monetization layer + +Publish: + +- normal web pages + +Control: + +- which bots crawl +- whether training is allowed +- whether some crawlers must pay + +Tradeoff: +Promising middle path, especially as crawler monetization standards mature, but +still early and not something to build the whole strategy around yet. + +## Recommendation + +For `doesitarm`, use Option 1 now, with a path to Option 2 later. + +Concretely: + +1. Treat the database as transition-agnostic. + Use dimensions like `platform_family`, `cpu_arch`, `translation_layer`, + `virtualization_layer`, `os_version`, `artifact_type`, and + `verification_method` so the same model can cover Apple Silicon, Windows on + ARM, or the next Apple transition. + +2. Build a public canonical facts layer. + Each app should have a canonical page with: + status, environments, timestamps, evidence links, and short synthesis. + +3. Build a public dataset layer. + Publish periodic snapshots with dataset landing pages, license, provenance, + versioning, and download metadata. + +4. Keep ingestion and raw evidence private. + Store raw downloads, scrape traces, matching logic, and low-confidence + candidates outside the public repo and public site. + +5. Add public machine-readable surfaces in this order: + - `SoftwareApplication`-style entity markup where it truthfully matches page content + - dataset landing pages plus `Dataset` / `DataDownload` metadata for exports + - stable JSON or CSV snapshots + - `llms.txt` and Markdown exports as secondary aids + +6. Make public pages citation-friendly. + Add clear authorship, methodology, "how we know", last verified date, and + source links. + +7. Avoid index bloat.
+ Keep canonical entity and high-intent comparison pages indexable. + Use `noindex` or crawl controls for low-value filter permutations and stale + or confusing pages. + +8. Measure before deciding how open to be. + Track: + - Search Console web traffic + - ChatGPT referral traffic via `utm_source=chatgpt.com` + - bot traffic by user agent + - crawl cost versus referral value + +Inference: +The best long-term moat is not withholding all facts. It is being the most +trusted and most reusable source for those facts, while keeping the expensive +and differentiating machinery private. + +## Near-Term Next Steps For doesitarm + +1. Add a public data-contract document describing the canonical app entity, + environment entity, evidence entity, and snapshot dataset. +2. Expand app pages from "status page" to "evidence page": + include methodology, last verified date, change history, and source + attribution. +3. Add structured data intentionally: + entity markup for app pages, dataset markup for exports, not generic markup + everywhere. +4. Add a public snapshot export and a dataset landing page. +5. Add a bot-policy matrix to `robots.txt` planning: + Google search, OpenAI search, Anthropic search, training bots, and user bots. +6. Add `llms.txt` only after the public canonical and export layers are clean. +7. Keep filters/search-result pages from becoming the primary indexable surface. 
+ +## Source Links + +- Repo context: + - [README.md](/Users/athena/Code/doesitarm/README.md) + - [docs/plans/app-discovery-d1-automation.md](/Users/athena/Code/doesitarm/docs/plans/app-discovery-d1-automation.md) + - [static/robots.txt](/Users/athena/Code/doesitarm/static/robots.txt) + - [helpers/structured-data.js](/Users/athena/Code/doesitarm/helpers/structured-data.js) + - [helpers/listing-page.js](/Users/athena/Code/doesitarm/helpers/listing-page.js) + - [helpers/config-node.js](/Users/athena/Code/doesitarm/helpers/config-node.js) + - [static/api/app/spotify.json](/Users/athena/Code/doesitarm/static/api/app/spotify.json) + +- Google AI features and AI search: + - https://developers.google.com/search/docs/appearance/ai-features + - https://developers.google.com/search/blog/2025/05/succeeding-in-ai-search + - https://developers.google.com/search/docs/fundamentals/creating-helpful-content + - https://developers.google.com/search/docs/essentials/spam-policies + - https://developers.google.com/search/docs/fundamentals/using-gen-ai-content + +- Google review and structured-data guidance: + - https://developers.google.com/search/docs/appearance/reviews-system + - https://developers.google.com/search/docs/specialty/ecommerce/write-high-quality-reviews + - https://developers.google.com/search/docs/appearance/structured-data/software-app + - https://developers.google.com/search/docs/appearance/structured-data/dataset + - https://developers.google.com/search/docs/appearance/structured-data/sd-policies + - https://developers.google.com/search/docs/crawling-indexing/crawling-managing-faceted-navigation + - https://developers.google.com/search/docs/crawling-indexing/block-indexing + +- Schema and dataset modeling: + - https://schema.org/SoftwareApplication + +- OpenAI: + - https://help.openai.com/en/articles/12627856-publishers-and-developers-faq + - https://help.openai.com/en/articles/9237897-chatgpt-search + - https://platform.openai.com/docs/gptbot + +- Anthropic: + - 
https://support.claude.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler + - https://docs.anthropic.com/en/docs/build-with-claude/search-results + +- Cloudflare: + - https://developers.cloudflare.com/style-guide/how-we-docs/ai-consumability/ + - https://blog.cloudflare.com/from-googlebot-to-gptbot-whos-crawling-your-site-in-2025/ + - https://blog.cloudflare.com/introducing-pay-per-crawl/ + +- Discovery and freshness: + - https://www.indexnow.org/index + +- Practitioner and discussion context: + - https://simonwillison.net/2025/Oct/24/claude-code-docs-map/ + - https://news.ycombinator.com/item?id=41072549 + - https://lobste.rs/s/dmuad3/mitigating_sourcehut_s_partial_outage + +## Source Quality Notes + +- Google Search Central, OpenAI, Anthropic, Schema.org, IndexNow, and Cloudflare + were the primary sources for current guidance. +- The HN and Lobsters links were useful for operator sentiment and failure modes, + not as primary authority for ranking behavior. +- `llms.txt` appears real and increasingly implemented, but the strongest + current evidence still says it is supplemental rather than foundational. diff --git a/docs/research/private-public-repo-sync-patterns-2026-04-04.md b/docs/research/private-public-repo-sync-patterns-2026-04-04.md new file mode 100644 index 0000000..da63dee --- /dev/null +++ b/docs/research/private-public-repo-sync-patterns-2026-04-04.md @@ -0,0 +1,222 @@ +# Private/Public Repo Sync Patterns For doesitarm + +Tease: Git can push to a second remote, but it does not natively maintain a safe long-lived branch that "merges everything except `docs/`." + +Lede: For `doesitarm` on 2026-04-04, the best-fit pattern is a second repo/remote plus an automated one-way export from `origin/master` that rewrites out `docs/` and force-pushes the result. + +Why it matters: +- `docs/` is already present on `origin/master`, so excluding it from a future mirror does not make it private retroactively. 
+- `docs/` does not appear to participate in the app build, which makes export-time exclusion low-risk. +- Cross-repo automation on GitHub needs separate auth; the default workflow token is not enough for another private repo. + +Go deeper: +- Simple one-way sync: fresh clone -> `git filter-repo --path docs/ --invert-paths` -> force-push to a second remote. +- Faster repeated projection: evaluate `splitsh-lite` if export speed or repeatability becomes important. +- Advanced bidirectional filtered views: `josh` is the serious option, but it is heavier than this repo likely needs. + +Date: 2026-04-04 + +## Scope + +Research whether `doesitarm` should support a `private-main`-style branch or +remote that automatically tracks the public default branch while excluding +paths such as `docs/`, and identify better patterns if they exist. + +## Short Answer + +Yes, you can push to a different remote from the public repo. + +No, the durable pattern is not a long-lived `private-main` merge branch that +keeps deleting `docs/` after every merge. Git branches and merges are full-tree +operations, and sparse checkout does not change that. + +For this repo, the cleanest pattern is: + +1. Keep `origin/master` as the source branch. +2. Add a second remote or second repository for the private target. +3. Run a one-way export job on each push to `master`. +4. In that job, create a sanitized tree/history with `docs/` removed. +5. Force-push the sanitized result to the private remote branch. + +If the real requirement is that future docs stay private, the better topology +is the reverse: keep the canonical repo private and generate the public export. + +Inference: +That last recommendation is based on the current repo state: `docs/` is already +committed on `origin/master`, so a private mirror without `docs/` only changes +future distribution, not past public exposure. + +## What The Repo Already Knows + +- The default remote today is only `origin`, pointed at the public GitHub repo. 
+- The default tracked branch is `origin/master`, not `main`. +- There is no checked-in `.github/` workflow that already handles cross-repo + sync. +- `docs/` currently contains repo-local planning and research material: + [docs/app-flow.md](../app-flow.md), + [docs/plans/app-test-typescript-refactor.md](../plans/app-test-typescript-refactor.md), + and dated research memos under + [docs/research](./). +- `docs/` is already present in `origin/master` history as of 2026-04-04. +- The build surface in + [package.json](../../package.json) + does not appear to depend on `docs/`. + +## What The Evidence Says + +- Different remote from the public repo: + yes. Git supports separate remotes, and the `git remote` docs explicitly say + that when fetch and push use different locations, you should use two separate + remotes rather than pretending they are the same remote. +- Long-lived branch with path exclusions: + not a native Git capability. Git merges operate on full trees, not + "everything except these directories." +- Sparse checkout: + not the answer here. The `git sparse-checkout` docs describe it as a working + directory reduction feature, and they note that operations such as merge or + rebase may still materialize paths outside the sparse specification. +- `git filter-repo`: + good fit for one-way export. The Git project now recommends it instead of + `filter-branch`, and its docs support `--invert-paths` for "keep everything + except these paths" rewrites. That matches "mirror the repo except `docs/`." +- `splitsh-lite`: + promising when you want repeatable projections into standalone repos and care + about performance. Its README supports split prefixes that can include + exclusions and uses a history cache, which is more appropriate than a manual + merge branch when this becomes a repeated sync lane. +- `josh`: + the advanced option.
Its repo describes a proxy Git server that exposes + filtered histories as standalone repos and synchronizes between original and + filtered views. This is the closest thing to a "real" selective mirror + system, but it adds operational weight. +- GitHub Actions auth: + the default `github.token` is scoped to the current repository. If a workflow + in the public repo needs to push to a different private repo, you need a PAT, + deploy key, or GitHub App token instead. + +## What Works + +- A second remote or second repository for the private target. +- A one-way generated branch or repo, not a hand-maintained merge branch. +- Rebuilding the private export from `origin/master` every run. +- Treating the private mirror as generated output with force-pushes allowed. +- Keeping development on one source branch and one source-of-truth repo. + +## What To Avoid + +- Do not maintain `private-main` by repeatedly merging `master` and deleting + `docs/`. That creates unnecessary churn and eventual conflict debt. +- Do not use sparse checkout as if it were a publishing filter. +- Do not make the generated private mirror a peer source of truth unless you + also adopt a projection system designed for bidirectional sync. +- Do not rely on the default GitHub Actions token for pushes to another repo. +- Do not assume this setup hides `docs/` historically; those files are already + in the public remote history. + +## Best Patterns + +### 1. Best Fit For This Repo: one-way export to a second remote + +Use a second remote, for example `private`, pointing at a separate private +GitHub repository. On each push to `origin/master`, run an automation that: + +1. checks out `master` +2. authenticates to the private repo with a PAT, deploy key, or GitHub App +3. creates a fresh export clone or export worktree +4. rewrites out `docs/` and any other excluded paths +5.
force-pushes the sanitized result to the private repo branch + +Why this is the best fit: + +- it matches the repo's current single-source workflow +- it does not depend on path-aware merges that Git does not have +- it keeps excluded-path logic in one place +- it is easy to reason about and recover from + +Tradeoffs: + +- exported commit SHAs will differ from public `master` +- the private mirror should be treated as generated/read-only + +### 2. Better If Repeated Projection Becomes Core: `splitsh-lite` + +If you end up publishing multiple filtered mirrors or need fast repeated +updates, `splitsh-lite` is worth a spike. It is built for turning repository +views into standalone histories and caching the work. + +Tradeoffs: + +- more specialized operational knowledge +- less obvious to future maintainers than a simple export script + +### 3. Better Only For Advanced Bidirectional Partial Views: `josh` + +If the real requirement becomes "developers commit through filtered views and +changes synchronize both directions," `josh` is the pattern to study. + +Tradeoffs: + +- significant infrastructure/runtime overhead +- far more complexity than `doesitarm` appears to need today + +### 4. Adjacent But Not The Same Problem: GitHub Private Mirrors + +GitHub's `private-mirrors` app is relevant if the goal is to collaborate +privately around a public repository and upstream later. It is not the right +answer for "same repo minus `docs/`," but it is worth noting as a neighboring +pattern. + +## Recommendation + +For `doesitarm`, use a separate private repository plus a generated sync job. +Name the target branch after the actual default branch in this repo, for +example `private-master` or simply `master` on the private remote. + +Do not implement this as a merge branch. + +If the aim is just "same code, different remote, minus `docs/`," a generated +one-way mirror is the right level of machinery.
+ +If the aim is "keep future internal docs private," move the source of truth to +a private repo and generate the public mirror from that private origin. + +## Missing Information + +- Whether the private target is intended to be read-only/generated or whether + anyone will commit directly to it. +- Whether `docs/` is the only excluded path or just the first example. +- Whether the real goal is secrecy, deployment hygiene, or private-only + collaboration before publishing. + +## Source Links + +- Git remote docs: + https://git-scm.com/docs/git-remote +- Git sparse-checkout docs: + https://git-scm.com/docs/git-sparse-checkout +- `git-filter-repo` repository: + https://github.com/newren/git-filter-repo +- `git-filter-repo` manual: + https://www.mankier.com/1/git-filter-repo +- `splitsh-lite` repository: + https://github.com/splitsh/lite +- `josh` repository: + https://github.com/josh-project/josh +- `actions/checkout` README: + https://github.com/actions/checkout +- GitHub App auth in GitHub Actions: + https://docs.github.com/en/enterprise-cloud@latest/apps/creating-github-apps/authenticating-with-a-github-app/making-authenticated-api-requests-with-a-github-app-in-a-github-actions-workflow +- GitHub deploy keys: + https://docs.github.com/v3/guides/managing-deploy-keys +- GitHub Private Mirrors app: + https://github.com/github-community-projects/private-mirrors +- Stack Overflow, partial sharing of Git repositories: + https://stackoverflow.com/questions/278270/partial-sharing-of-git-repositories + +## Source Quality Notes + +- HN and Lobsters searches on 2026-04-04 did not surface a clearly better + mainstream pattern than the Git/GitHub docs plus the specialized projection + tools above. +- Primary docs and project READMEs were materially more useful than forum + commentary for this question. 
diff --git a/docs/research/public-repo-security-and-monorepo-patterns-2026-04-04.md b/docs/research/public-repo-security-and-monorepo-patterns-2026-04-04.md new file mode 100644 index 0000000..8bf5bf3 --- /dev/null +++ b/docs/research/public-repo-security-and-monorepo-patterns-2026-04-04.md @@ -0,0 +1,396 @@ +# Public Repo Security And Monorepo Patterns For doesitarm + +Tease: The safest version of this plan keeps `doesitarm` public, but treats credentials, imports, downloaded app artifacts, and privileged automation as private operational surfaces. + +Lede: For `doesitarm` on 2026-04-04, the best-fit pattern is a Kriasoft-style public monorepo with clear `apps/`, `packages/`, `db/`, and `infra/` boundaries, plus hardened GitHub Actions, GitHub-hosted runners for public workflows, D1 local development via Wrangler, and private storage for secrets, backups, and quarantined artifacts. + +Why it matters: +- The current repo is about to add higher-risk surfaces: D1, automated app discovery, archive downloading, scheduled jobs, and more Cloudflare automation. +- In a public repo, CI/CD mistakes matter as much as application code mistakes. Workflow files, tokens, logs, and runner choices become part of the threat model. +- The current repo already has one immediate security problem: a workflow prints secret-derived files to CI logs. + +Go deeper: +- Keep the code public; keep secrets, raw data, and operational state private. +- Refactor toward a monorepo shape early so new ingestion, scanner, D1, and infra code do not spread across a flat root. +- Adopt OSS-friendly GitHub hardening: read-only default `GITHUB_TOKEN`, pinned actions, CODEOWNERS on workflow/infra/db paths, secret scanning, private vulnerability reporting, and no self-hosted runners for public PRs. 
+ +Date: 2026-04-04 + +## Scope + +Research security considerations and common open-source repository patterns for a +setup like `doesitarm`: + +- public GitHub repository +- Cloudflare Workers and D1 +- scheduled automation +- automated downloading and scanning of third-party app archives +- prospective monorepo refactor in the style of + `kriasoft/react-starter-kit` + +This memo is intended to drive updates to +[app-discovery-d1-automation.md](/Users/athena/Code/doesitarm/docs/plans/app-discovery-d1-automation.md). + +## Short Answer + +Do not move the whole repo private. + +Instead: + +1. Keep the application and infrastructure code public. +2. Move secrets, imported raw data, D1 operational state, downloaded artifacts, + quarantined samples, and any sensitive fixtures to private systems. +3. Refactor into a monorepo early, using a Kriasoft-style structure adapted to + this repo's existing pnpm/Netlify/Astro/Workers setup. +4. Harden GitHub Actions before expanding automation. + +Best-fit recommendation: + +- Public monorepo with `apps/`, `packages/`, `db/`, `infra/`, `scripts/`, + and `docs/` +- GitHub-hosted runners for public workflows +- GitHub environment secrets with required reviewers for production deploys +- Cloudflare D1 local development and tests via Wrangler `--local`, + `preview_database_id`, and test harnesses like `unstable_dev()`/Miniflare +- Private object storage or equivalent for raw app archives, import dumps, + and quarantine material + +Inference: +This is the right fit because the repo is open source and community-facing, but +the risky parts are operational, not architectural. Public code is compatible +with good security here; public credentials and public operational data are not. + +## What The Repo Already Knows + +- The repo is currently flat-rooted, not organized as a workspace monorepo. +- There is no checked-in D1 configuration or local D1 bootstrap yet. 
+- There is Cloudflare deployment automation in + [deploy-cloudflare-workers.yml](../../.github/workflows/deploy-cloudflare-workers.yml). +- That workflow currently decodes secret-backed `.env` / `wrangler.toml` files + and prints them with `cat`, which is a real security issue in CI logs. +- The site build still depends on remote/env-backed feeds such as + `SCANS_SOURCE`, `COMMITS_SOURCE`, `HOMEBREW_SOURCE`, `GAMES_SOURCE`, and + `VFUNCTIONS_URL`. +- The scanner and planned discovery pipeline will process untrusted third-party + files, including archive formats like ZIP, DMG, and PKG. +- `.env` is ignored at the root, and per-worker `wrangler.toml` files are + already ignored in worker subdirectories. + +## What The Evidence Says + +### 1. Public repos can stay public if the operational boundary is private + +GitHub's own docs assume public repositories will use: + +- repository or environment secrets +- restricted organization secret access +- private vulnerability reporting +- automatic secret scanning on public repos + +That is strong evidence that the normal pattern is not "make the repo private"; +it is "keep sensitive operational material out of the repo and out of logs." + +### 2. Default GitHub Actions posture should be least privilege + +GitHub recommends: + +- minimum required `GITHUB_TOKEN` permissions +- default repository token permission set to read-only +- escalating permissions only per job +- using a GitHub App token if a job needs more than `GITHUB_TOKEN` can provide + +This matches what open-source repos increasingly do for deploy, release, and +cross-repo automation. + +### 3. 
Secrets are still easy to leak through logs and workflow behavior + +GitHub's secure-use docs explicitly warn that: + +- redaction is not guaranteed for transformed values +- structured blobs like JSON/YAML are poor secret formats +- sensitive values that are not stored as GitHub secrets should be masked explicitly with `::add-mask::` +- exposed secrets in logs should trigger deletion/rotation + +For `doesitarm`, this directly applies to the current workflow that prints +secret-derived config files into CI output. + +### 4. Public repos should avoid self-hosted runners for untrusted PRs + +GitHub explicitly recommends self-hosted runners only with private +repositories, because forks of public repositories can run dangerous code on +them through pull requests. + +For this repo, that means: + +- do not put public PR workflows on a local machine or other long-lived + self-hosted runner +- do not run untrusted archive-processing jobs on a self-hosted runner that + also holds production credentials + +### 5. `pull_request_target` remains a common footgun + +GitHub Security Lab's `Preventing pwn requests` guidance is still the clearest +implementation reference: + +- `pull_request_target` plus checking out/building PR code is dangerous +- untrusted PR code should run in an unprivileged `pull_request` workflow +- privileged follow-up actions should happen through `workflow_run` with + carefully handled artifacts + +HN discussion around real workflow exploits reinforces the same point: the +problem is not theoretical. + +### 6. 
Common OSS hardening patterns for GitHub workflows are now well-defined + +GitHub secure-use guidance and OpenSSF best-practice guidance converge on: + +- pin actions to full commit SHAs +- restrict allowed actions where possible +- guard `.github/workflows/` with `CODEOWNERS` +- keep default branch protected +- require reviews and passing checks +- use code scanning / dependency review / secret scanning / Dependabot +- use private vulnerability reporting for public repos + +These are standard public-repo practices, not enterprise-only overkill. + +### 7. Cloudflare D1 already supports local-first development and tests + +Cloudflare's D1 docs explicitly support: + +- `wrangler dev` local mode +- `preview_database_id` +- `wrangler d1 migrations apply --local` +- test setups using Miniflare and `unstable_dev()` + +That means D1 does not require a private repo or remote-only workflow. It fits +the "run locally on this machine, then automate" plan well. + +### 8. Cloudflare Workflows and observability make Cloudflare a credible later home for ingestion + +Cloudflare Workflows now position themselves as durable multi-step execution +with retries, persisted state, and debugging. Workers Logs and Traces provide +native observability. That is enough evidence to treat Cloudflare as a viable +later landing zone for scheduled ingestion and scan orchestration. + +Inference: +GitHub Actions is still the easier first scheduler because it is already in the +repo, but Cloudflare Workflows has matured enough to stay in the plan as a +serious later option. + +### 9. Kriasoft's monorepo shape is a good architectural fit, but not every exact convention should be copied blindly + +`kriasoft/react-starter-kit` is a public monorepo with: + +- `apps/` +- `packages/` +- `db/` +- `docs/` +- `infra/` +- `scripts/` + +It also documents a public template env pattern where committed `.env` +contains placeholders/defaults and `.env.local` contains real credentials. 
+ +That shape is a strong fit for `doesitarm`, but I would adapt the env pattern +slightly for safety and clarity: + +- keep a committed public template file such as `.env.example` +- keep real credentials in `.env.local`, `.dev.vars`, GitHub environment + secrets, and Cloudflare secrets + +Inference: +Kriasoft's folder layout is the part worth copying directly. The exact env-file +naming should follow the least-confusing safe convention for this repo. + +## Common Open-Source Patterns That Fit doesitarm + +### Public code, private state + +Keep public: + +- app code +- scanner code +- D1 schema and migrations +- workflow definitions +- docs and plans + +Keep private: + +- deploy credentials and tokens +- raw Google Sheets exports or database backups +- downloaded app archives +- quarantine samples +- private test fixtures that would create redistribution or abuse risk +- operational dashboards and alert destinations + +### Workspace monorepo with clear trust boundaries + +Best-fit structure for `doesitarm`: + +- `apps/web/` — Astro site and app-test UI +- `apps/default-worker/` — current `doesitarm-default` +- `apps/analytics-worker/` — current `workers/analytics` +- `apps/ingest/` or `apps/discovery/` — CLI/admin surface for discovery jobs +- `packages/scanner-core/` — shared scan engine and file-format logic +- `packages/source-runners/` — Homebrew/GitHub/download-page source runners +- `packages/data-model/` — shared D1 schema types, DTOs, validation +- `packages/site-build/` — list/build/export helpers +- `db/` — D1 migrations, seeds, import scripts, local test DB helpers +- `infra/` — Wrangler config, deploy config, policy docs +- `scripts/` — repo automation +- `docs/` — plans, research, operational docs + +### Repo template files, not repo secrets + +Common OSS pattern: + +- commit `.env.example` or placeholder-only `.env` +- ignore `.env.local`, `.dev.vars`, and `.wrangler/` +- keep Cloudflare secrets in Workers secrets / GitHub environment secrets + +### 
Hardened GitHub Actions for public forks + +Common OSS pattern: + +- default `permissions: { contents: read }` +- explicit per-job escalation only +- require approval for fork PR workflows where appropriate +- no self-hosted runners for public PRs +- no `pull_request_target` workflows that checkout/build PR code + +### Supply-chain hygiene for workflows + +Common OSS pattern: + +- pin actions to full SHAs +- restrict allowed actions +- Dependabot for action updates +- CodeQL / code scanning for workflow vulnerabilities +- OpenSSF Scorecards for ongoing hygiene checks + +### Disclosure and scanning defaults + +Common OSS pattern: + +- enable private vulnerability reporting +- enable secret scanning and push protection +- keep a `SECURITY.md` policy + +## What Works + +- Keeping the repo public while moving secrets and sensitive data out of git +- Refactoring to a monorepo before adding more D1/discovery complexity +- Treating workflow files, `infra/`, and `db/` as protected surfaces with + `CODEOWNERS` +- Using GitHub-hosted runners for public CI and scheduled jobs +- Using environment-specific secrets with required reviewers for production + deployment jobs +- Using D1 local mode and local migrations as part of normal development +- Using Cloudflare Logs/Traces or equivalent observability for scheduled jobs +- Storing raw archives and quarantine material in private object storage rather + than in the repo + +## What To Avoid + +- Do not move the whole repo private as a substitute for secrets hygiene +- Do not keep the current workflow behavior that prints secret-derived files to + CI logs +- Do not use self-hosted runners for public PR workflows +- Do not run archive downloads/extraction in privileged workflows that also have + deploy credentials +- Do not combine `pull_request_target` with explicit PR checkout/build steps +- Do not keep adding discovery/D1/worker code into the current flat root +- Do not commit raw import dumps, app archives, or structured secret 
blobs + +## Recommendation + +For `doesitarm`, the strongest next-step package is: + +1. Refactor toward a Kriasoft-style monorepo shape adapted to pnpm. +2. Add a security-hardening stage before expanding automation. +3. Keep the repo public. +4. Keep secrets, raw operational data, and archive/quarantine material private. +5. Start scheduled discovery on GitHub-hosted runners with hardened workflows. +6. Keep Cloudflare Workflows as a second-phase target for durable ingestion. + +Immediate high-priority actions to capture in the plan: + +1. Remove secret printing from + [deploy-cloudflare-workers.yml](../../.github/workflows/deploy-cloudflare-workers.yml) + and rotate affected secrets. +2. Add repo policy and tooling for: + - read-only default `GITHUB_TOKEN` + - pinned actions + - `CODEOWNERS` for `.github/workflows/`, `infra/`, and `db/` + - secret scanning / push protection + - private vulnerability reporting +3. Add ignored local-secret files for the new D1/Workers workflow: + - `.env.local` + - `.dev.vars` + - `.wrangler/` +4. Keep public PR CI on GitHub-hosted runners only. +5. Store raw archives/import snapshots outside the repo. + +## Missing Information + +- Whether the future ingestion runtime is expected to stay GitHub-first or + eventually move fully to Cloudflare Workers/Workflows. +- Whether there are legal or vendor-policy constraints around storing downloaded + app archives long term. +- Whether the monorepo refactor should keep Netlify as-is or consolidate more + runtime surfaces onto Cloudflare. 
+ +## Source Links + +- GitHub Docs, `GITHUB_TOKEN` least-privilege and GitHub App escalation: + https://docs.github.com/en/actions/tutorials/authenticate-with-github_token +- GitHub Docs, secrets in Actions, fork-secret behavior, environment reviewers, + OIDC, and masking: + https://docs.github.com/en/actions/security-for-github-actions/security-guides/using-secrets-in-github-actions +- GitHub Docs, secure use reference, pinning actions, CODEOWNERS, code scanning, + Dependabot, and Scorecards: + https://docs.github.com/en/actions/reference/security/secure-use +- GitHub Docs, self-hosted runner warning for public repositories: + https://docs.github.com/en/actions/how-tos/manage-runners/self-hosted-runners/add-runners +- GitHub Docs, limiting self-hosted runners in organizations: + https://docs.github.com/en/organizations/managing-organization-settings/disabling-or-limiting-github-actions-for-your-organization +- GitHub Docs, approval requirements for fork PR workflows: + https://docs.github.com/en/actions/managing-workflow-runs-and-deployments/managing-workflow-runs/approving-workflow-runs-from-public-forks +- GitHub Docs, repository Actions settings and fork workflow controls: + https://docs.github.com/github/administering-a-repository/managing-repository-settings/disabling-or-limiting-github-actions-for-a-repository +- GitHub Docs, secret scanning for public repositories: + https://docs.github.com/github/administering-a-repository/about-token-scanning +- GitHub Docs, enabling secret scanning / push protection: + https://docs.github.com/en/code-security/how-tos/secure-your-secrets/detect-secret-leaks/enabling-secret-scanning-for-your-repository +- GitHub Docs, enabling push protection: + https://docs.github.com/en/code-security/secret-scanning/enabling-secret-scanning-features/enabling-push-protection-for-your-repository +- GitHub Docs, private vulnerability reporting: + 
https://docs.github.com/en/code-security/security-advisories/working-with-repository-security-advisories/configuring-private-vulnerability-reporting-for-a-repository +- GitHub Security Lab, `pull_request_target` / `workflow_run` guidance: + https://securitylab.github.com/resources/github-actions-preventing-pwn-requests/ +- OpenSSF GitHub configuration best practices: + https://best.openssf.org/SCM-BestPractices/github/ +- Kriasoft React Starter Kit: + https://github.com/kriasoft/react-starter-kit +- Cloudflare D1 local development: + https://developers.cloudflare.com/d1/best-practices/local-development/ +- Cloudflare Workers observability: + https://developers.cloudflare.com/workers/observability/ +- Cloudflare Workers logs: + https://developers.cloudflare.com/workers/observability/logs/ +- Cloudflare Workers traces: + https://developers.cloudflare.com/workers/observability/traces/ +- Cloudflare Workflows overview: + https://developers.cloudflare.com/workflows/ + +## Source Quality Notes + +- Highest-confidence sources in this memo are GitHub Docs, GitHub Security Lab, + OpenSSF, Cloudflare Docs, and the Kriasoft repository itself. +- HN/Lobsters did not surface a materially better competing pattern in this + pass; the most useful HN signal reinforced GitHub Security Lab's warning on + `pull_request_target`. +- The recommendation to keep the repo public but move operational data private + is a synthesis from official guidance plus this repo's current shape and risk + surface. 
diff --git a/helpers/app-files-scanner.js b/helpers/app-files-scanner.js index f817a12..3f4e555 100644 --- a/helpers/app-files-scanner.js +++ b/helpers/app-files-scanner.js @@ -7,7 +7,7 @@ import { isString } from './check-types.js' import parseMacho from './macho/index.js' // Vite Web Workers - https://vitejs.dev/guide/features.html#web-workers -import { runScanWorker } from '~/helpers/scanner/client.mjs' +import { runScanWorker } from '~/helpers/scanner/client' const scannerVersion = (() => { // If there's no window @@ -341,6 +341,10 @@ export default class AppFilesScanner { .then( response => response.data ) .catch(function (error) { console.error(error) + + return { + supportedVersionNumber: null + } }) return { @@ -348,6 +352,47 @@ export default class AppFilesScanner { } } + getFinishedStatusMessage ({ + binarySupportsNative, + supportedVersionNumber + }) { + if ( binarySupportsNative ) { + return '✅ This app is natively compatible with Apple Silicon!' + } + + if ( supportedVersionNumber != null ) { + return [ + '✅ A native version of this has been reported', + ( isString( supportedVersionNumber ) && supportedVersionNumber.length > 0 ) ? `as of v${ supportedVersionNumber }` : null + ].filter( Boolean ).join(' ') + } + + return `🔶 This app file is not natively compatible with Apple Silicon and may only run via Rosetta 2 translation, however, software vendors will sometimes will ship separate install files for Intel and ARM instead of a single one. 
` + } + + finishFileScan ( file, scanIndex, { + binarySupportsNative, + supportedVersionNumber + } ) { + file.statusMessage = this.getFinishedStatusMessage({ + binarySupportsNative, + supportedVersionNumber + }) + file.status = 'finished' + + if ( binarySupportsNative ) { + this.files.unshift( this.files.splice( scanIndex, 1 )[0] ) + } + } + + applyWorkerScanData ( file, scan ) { + file.appVersion = scan.appVersion || null + file.displayName = scan.displayName || file.displayName + file.details = Array.isArray( scan.details ) ? scan.details : [] + file.displayBinarySize = scan.displayBinarySize || null + file.binarySize = typeof scan.binarySize === 'number' ? scan.binarySize : null + } + async scanFile ( file, scanIndex ) { // If we've already scanned this @@ -553,26 +598,10 @@ export default class AppFilesScanner { console.log('supportedVersionNumber', supportedVersionNumber) - let finishedStatusMessage = '' - - if ( binarySupportsNative ) { - finishedStatusMessage = '✅ This app is natively compatible with Apple Silicon!' - - // Shift this scan to the top - this.files.unshift( this.files.splice( scanIndex, 1 )[0] ) - } else if ( supportedVersionNumber !== null ) { - - finishedStatusMessage = [ - '✅ A native version of this has been reported', - (supportedVersionNumber.length > 0) ? `as of v${supportedVersionNumber}` : null - ].join(' ') - - } else { - finishedStatusMessage = `🔶 This app file is not natively compatible with Apple Silicon and may only run via Rosetta 2 translation, however, software vendors will sometimes will ship separate install files for Intel and ARM instead of a single one. 
` - } - - file.statusMessage = finishedStatusMessage - file.status = 'finished' + this.finishFileScan( file, scanIndex, { + binarySupportsNative, + supportedVersionNumber + } ) return } @@ -618,20 +647,45 @@ export default class AppFilesScanner { console.log( 'scannerVersion', scannerVersion ) if ( scannerVersion === '2' ) { + try { + const { scan } = await runScanWorker( file.instance, messageDetails => { + console.log( 'messageDetails', messageDetails ) + if ( isString( messageDetails.message ) ) { + file.statusMessage = messageDetails.message + } - const { scan } = await runScanWorker( file.instance, messageDetails => { - console.log( 'messageDetails', messageDetails ) + if ( isString( messageDetails.status ) ) { + file.status = messageDetails.status + } + } ) - file.statusMessage = messageDetails.message - file.status = messageDetails.status - } ) + this.applyWorkerScanData( file, scan ) - console.log('scan', scan) + const { supportedVersionNumber } = await this.submitScanInfo({ + filename: scan.info?.filename || file.name, + appVersion: scan.info?.appVersion || file.appVersion, + result: scan.info?.result || ( scan.binarySupportsNative ? 
'✅' : '🔶' ), + machoMeta: scan.info?.machoMeta || null, + infoPlist: scan.info?.infoPlist || null + }) - clearTimeout(timer) + this.finishFileScan( file, scanIndex, { + binarySupportsNative: Boolean( scan.binarySupportsNative ), + supportedVersionNumber + } ) - resolve() + clearTimeout(timer) + + resolve() + } catch ( error ) { + file.statusMessage = `❔ ${ error.message }` + file.status = 'finished' + + clearTimeout(timer) + + resolve() + } return } diff --git a/helpers/pagefind/load-sitemap-endpoints.ts b/helpers/pagefind/load-sitemap-endpoints.ts new file mode 100644 index 0000000..dd27fe4 --- /dev/null +++ b/helpers/pagefind/load-sitemap-endpoints.ts @@ -0,0 +1,63 @@ +import fs from 'fs-extra' +import axios from 'axios' + +import { + sitemapEndpointsPath +} from '~/helpers/pagefind/config.js' + +function shouldRetryError ( error: unknown ) { + const status = ( error as { response?: { status?: number } } )?.response?.status + + return typeof status === 'number' && status >= 500 +} + +async function fetchJsonWithRetries ( + url: string, + { + attempts = 3, + delayMs = 1000 + }: { + attempts?: number + delayMs?: number + } = {} +) { + let lastError: unknown + + for ( let attempt = 1; attempt <= attempts; attempt += 1 ) { + try { + const response = await axios.get( url ) + + return response.data + } catch ( error ) { + lastError = error + + if ( attempt >= attempts || !shouldRetryError( error ) ) { + throw error + } + + await new Promise( resolve => setTimeout( resolve, delayMs ) ) + } + } + + throw lastError +} + +export async function loadSitemapEndpoints () { + if ( await fs.pathExists( sitemapEndpointsPath ) ) { + return await fs.readJson( sitemapEndpointsPath ) + } + + if ( !process.env.PUBLIC_API_DOMAIN ) { + throw new Error(`Missing ${ sitemapEndpointsPath } and PUBLIC_API_DOMAIN is not set`) + } + + const apiUrl = new URL( process.env.PUBLIC_API_DOMAIN ) + apiUrl.pathname = sitemapEndpointsPath.replace(/^\.?\/?static\//, '/') + + return await 
fetchJsonWithRetries( apiUrl.toString() ) +} + +export { + fetchJsonWithRetries, + shouldRetryError +} diff --git a/helpers/public-runtime-config.mjs b/helpers/public-runtime-config.mjs index 7c30770..ecddf77 100644 --- a/helpers/public-runtime-config.mjs +++ b/helpers/public-runtime-config.mjs @@ -2,14 +2,25 @@ import dotenv from 'dotenv' dotenv.config() +const fallbackVerbiage = { + macs: 'Apple M4 Max or M3 Ultra Mac', + processors: 'Apple M4 Max and M3 Ultra' +} +function getRuntimeValue ( envValue, fallbackValue = null ) { + if ( typeof envValue === 'string' && envValue.length > 0 ) { + return envValue + } + + return fallbackValue +} export const publicRuntimeConfig = { allUpdateSubscribe: process.env.ALL_UPDATE_SUBSCRIBE, testResultStore: process.env.TEST_RESULT_STORE, siteUrl: process.env.URL, - macsVerbiage: process.env.npm_package_config_verbiage_macs, - processorsVerbiage: process.env.npm_package_config_verbiage_processors, + macsVerbiage: getRuntimeValue( process.env.npm_package_config_verbiage_macs, fallbackVerbiage.macs ), + processorsVerbiage: getRuntimeValue( process.env.npm_package_config_verbiage_processors, fallbackVerbiage.processors ), } export function makeViteDefinitions () { diff --git a/helpers/scanner/client.mjs b/helpers/scanner/client.mjs deleted file mode 100644 index 5dacdda..0000000 --- a/helpers/scanner/client.mjs +++ /dev/null @@ -1,104 +0,0 @@ -import AppScanWorker from './worker.mjs?worker' - -const noop = () => {} - -function getArrayBufferFromFileData ( file ) { - return new Promise( ( resolve, reject ) => { - - // If it has a .arrayBuffer function - // then return that - // (Likely a browser File blob) - if ( typeof file.arrayBuffer === 'function' ) { - file.arrayBuffer().then( resolve ) - - return - } - - // If it has a truthy .arrayBuffer property - // then return that - // (Likely a node File object) - if ( !!file?.arrayBuffer ) { - resolve( file.arrayBuffer ) - return - } - - // Assume it's a Node Buffer from fs.readFile - - 
- resolve( file.buffer ) - - // const hasFileReader = typeof FileReader !== 'undefined' - // const reader = hasFileReader ? new FileReader() : new FileApi.FileReader() - - // reader.onerror = function onerror ( readerEvent ) { - // reject( readerEvent.target.error ) - // } - - // reader.onload = function onload ( readerEvent ) { - // resolve( readerEvent.target.result ) - // } - - // reader.readAsArrayBuffer( file ) - }) -} - -export async function runScanWorker ( file, messageReceiver = noop ) { - // console.log( 'file', file ) - - const appScanWorker = new AppScanWorker() - - const fileArrayBuffer = ( typeof file.arrayBuffer === 'function' ) ? (await file.arrayBuffer()) : file.arrayBuffer - - if ( !fileArrayBuffer ) { - throw new Error( 'No fileArrayBuffer' ) - } - - const scan = await new Promise( ( resolve, reject ) => { - // Set up the worker message handler - appScanWorker.onmessage = async (event) => { - // console.log( 'Main received message', event ) - - const { status } = event.data - - messageReceiver( event.data ) - - // Resolves promise on finished status - if ( status === 'finished' ) { - const { scan } = event.data - resolve( scan ) - } - } - - // Set up the worker error handler - appScanWorker.onerror = async ( errorEvent ) => { - console.error( 'Error received from App Scan Worker', errorEvent ) - reject() - } - - - // Start the worker - // https://developer.mozilla.org/en-US/docs/Web/API/Worker/postMessage - appScanWorker.postMessage( { - status: 'start', - options: { - file: { - ...file, - // We put it into an array - // so that it's iterable for Blob - arrayBuffer: [ fileArrayBuffer ] - } - } - }, [ - // This array is our transferrable objects - // so that the App Scan Worker is allowed - // to use existing data from the main thread - // and we don't have to clone the data from scratch - fileArrayBuffer - ] ) - }) - - return { - scan, - appScanWorker - } -} diff --git a/helpers/scanner/client.ts b/helpers/scanner/client.ts new file mode 100644 
index 0000000..ac3c993 --- /dev/null +++ b/helpers/scanner/client.ts @@ -0,0 +1,124 @@ +import AppScanWorker from './worker?worker' + +import type { + AppScanSnapshot, + ScanFileLike, + ScanMessage +} from './scan' + +const noop = () => {} + +type ScanMessageReceiver = ( details: ScanMessage ) => void + +interface WorkerScanFile extends ScanFileLike { + arrayBuffer: ArrayBuffer +} + +interface WorkerFinishedMessage extends ScanMessage { + error?: { + message?: string + } + scan?: AppScanSnapshot + status: 'finished' +} + +function toArrayBuffer ( value: ArrayBuffer | ArrayBufferView ) { + if ( value instanceof ArrayBuffer ) { + return value + } + + return new Uint8Array( + value.buffer, + value.byteOffset, + value.byteLength + ).slice().buffer +} + +function isWorkerFinishedMessage ( details: ScanMessage | WorkerFinishedMessage ): details is WorkerFinishedMessage { + return details.status === 'finished' +} + +async function getArrayBufferFromFileData ( file: ScanFileLike ) { + if ( typeof file.arrayBuffer === 'function' ) { + return await file.arrayBuffer() + } + + if ( file.arrayBuffer instanceof ArrayBuffer ) { + return file.arrayBuffer + } + + if ( file.buffer instanceof ArrayBuffer ) { + return file.buffer + } + + if ( ArrayBuffer.isView( file.buffer ) ) { + return toArrayBuffer( file.buffer ) + } + + throw new Error( 'No fileArrayBuffer' ) +} + +function makeWorkerFile ( file: ScanFileLike, arrayBuffer: ArrayBuffer ): WorkerScanFile { + return { + arrayBuffer, + name: file.name, + size: file.size ?? arrayBuffer.byteLength, + type: file.type ?? file.mimeType ?? 
'' + } +} + +export async function runScanWorker ( + file: ScanFileLike, + messageReceiver: ScanMessageReceiver = noop +) { + const AppScanWorkerConstructor = AppScanWorker as unknown as { new (): Worker } + const appScanWorker = new AppScanWorkerConstructor() + const fileArrayBuffer = await getArrayBufferFromFileData( file ) + const workerFile = makeWorkerFile( file, fileArrayBuffer ) + + const scan = await new Promise( ( resolve, reject ) => { + const cleanup = () => { + appScanWorker.onmessage = null + appScanWorker.onerror = null + appScanWorker.terminate() + } + + appScanWorker.onmessage = ( event: MessageEvent ) => { + const details = event.data + + messageReceiver( details ) + + if ( !isWorkerFinishedMessage( details ) ) { + return + } + + cleanup() + + if ( details.scan ) { + resolve( details.scan ) + return + } + + reject( new Error( details.error?.message || details.message || 'Worker finished without a scan result.' ) ) + } + + appScanWorker.onerror = ( errorEvent: ErrorEvent ) => { + cleanup() + reject( new Error( errorEvent.message || 'Error received from App Scan Worker' ) ) + } + + appScanWorker.postMessage( { + status: 'start', + options: { + file: workerFile + } + }, [ + fileArrayBuffer + ] ) + } ) + + return { + appScanWorker, + scan + } +} diff --git a/helpers/scanner/file-api.js b/helpers/scanner/file-api.js deleted file mode 100644 index 16450a6..0000000 --- a/helpers/scanner/file-api.js +++ /dev/null @@ -1,333 +0,0 @@ -import { EventEmitter } from 'events' - -export function File (input) { - var self = this; - - function updateStat(stat) { - self.stat = stat; - self.lastModifiedDate = self.stat.mtime; - self.size = self.stat.size; - } - - if ('string' === typeof input) { - self.path = input; - } else { - Object.keys(input).forEach(function (k) { - self[k] = input[k]; - }); - } - - self.name = self.name// || path.basename(self.path||''); - if (!self.name) { - throw new Error("No name"); - } - self.type = self.type// || mime.lookup(self.name); - 
- if (!self.path) { - if (self.buffer) { - self.size = self.buffer.length; - } else if (!self.stream) { - throw new Error('No input, nor stream, nor buffer.'); - } - return; - } - - if (!self.jsdom) { - return; - } - - // if (!self.async) { - // updateStat(fs.statSync(self.path)); - // } else { - // fs.stat(self.path, function (err, stat) { - // updateStat(stat); - // }); - // } -} - - -function doop(fn, args, context) { - if ('function' === typeof fn) { - fn.apply(context, args); - } -} - -function toDataUrl(data, type) { - // var data = self.result; - var dataUrl = 'data:'; - - if (type) { - dataUrl += type + ';'; - } - - if (/text/i.test(type)) { - dataUrl += 'charset=utf-8,'; - dataUrl += data.toString('utf8'); - } else { - dataUrl += 'base64,'; - dataUrl += data.toString('base64'); - } - - return dataUrl; -} - -function mapDataToFormat(file, data, format, encoding) { - // var data = self.result; - - switch (format) { - case 'buffer': - return data; - break; - case 'binary': - return data.toString('binary'); - break; - case 'dataUrl': - return toDataUrl(data, file.type); - break; - case 'text': - return data.toString(encoding || 'utf8'); - break; - } -} - -export function FileReader () { - var self = this, - emitter = new EventEmitter, - file; - - self.addEventListener = function(on, callback) { - emitter.on(on, callback); - }; - self.removeEventListener = function(callback) { - emitter.removeListener(callback); - } - self.dispatchEvent = function(on) { - emitter.emit(on); - } - - self.EMPTY = 0; - self.LOADING = 1; - self.DONE = 2; - - self.error = undefined; // Read only - self.readyState = self.EMPTY; // Read only - self.result = undefined; // Road only - - // non-standard - self.on = function() { - emitter.on.apply(emitter, arguments); - } - self.nodeChunkedEncoding = false; - self.setNodeChunkedEncoding = function(val) { - self.nodeChunkedEncoding = val; - }; - // end non-standard - - - - // Whatever the file object is, turn it into a Node.JS File.Stream - 
function createFileStream() { - var stream = new EventEmitter(), - chunked = self.nodeChunkedEncoding; - - // attempt to make the length computable - // if (!file.size && chunked && file.path) { - // fs.stat(file.path, function(err, stat) { - // file.size = stat.size; - // file.lastModifiedDate = stat.mtime; - // }); - // } - - - // The stream exists, do nothing more - if (file.stream) { - return; - } - - - // Create a read stream from a buffer - if (file.buffer) { - process.nextTick(function() { - stream.emit('data', file.buffer); - stream.emit('end'); - }); - file.stream = stream; - return; - } - - - // Create a read stream from a file - // if (file.path) { - // // TODO url - // if (!chunked) { - // fs.readFile(file.path, function(err, data) { - // if (err) { - // stream.emit('error', err); - // } - // if (data) { - // stream.emit('data', data); - // stream.emit('end'); - // } - // }); - - // file.stream = stream; - // return; - // } - - // // TODO don't duplicate this code here, - // // expose a method in File instead - // file.stream = fs.createReadStream(file.path); - // } - } - - - - // before any other listeners are added - emitter.on('abort', function() { - self.readyState = self.DONE; - }); - - - - // Map `error`, `progress`, `load`, and `loadend` - function mapStreamToEmitter(format, encoding) { - var stream = file.stream, - buffers = [], - chunked = self.nodeChunkedEncoding; - - buffers.dataLength = 0; - - stream.on('error', function(err) { - if (self.DONE === self.readyState) { - return; - } - - self.readyState = self.DONE; - self.error = err; - emitter.emit('error', err); - }); - - stream.on('data', function(data) { - if (self.DONE === self.readyState) { - return; - } - - buffers.dataLength += data.length; - buffers.push(data); - - emitter.emit('progress', { - // fs.stat will probably complete before this - // but possibly it will not, hence the check - lengthComputable: (!isNaN(file.size)) ? 
true : false, - loaded: buffers.dataLength, - total: file.size - }); - - emitter.emit('data', data); - }); - - stream.on('end', function() { - if (self.DONE === self.readyState) { - return; - } - - var data; - - if (buffers.length > 1) { - data = Buffer.concat(buffers); - } else { - data = buffers[0]; - } - - self.readyState = self.DONE; - self.result = mapDataToFormat(file, data, format, encoding); - emitter.emit('load', { - target: { - // non-standard - nodeBufferResult: data, - result: self.result - } - }); - - emitter.emit('loadend'); - }); - } - - - // Abort is overwritten by readAsXyz - self.abort = function() { - if (self.readState == self.DONE) { - return; - } - self.readyState = self.DONE; - emitter.emit('abort'); - }; - - - - // - function mapUserEvents() { - emitter.on('start', function() { - doop(self.onloadstart, arguments); - }); - emitter.on('progress', function() { - doop(self.onprogress, arguments); - }); - emitter.on('error', function(err) { - // TODO translate to FileError - if (self.onerror) { - self.onerror(err); - } else { - if (!emitter.listeners.error || !emitter.listeners.error.length) { - throw err; - } - } - }); - emitter.on('load', function() { - doop(self.onload, arguments); - }); - emitter.on('end', function() { - doop(self.onloadend, arguments); - }); - emitter.on('abort', function() { - doop(self.onabort, arguments); - }); - } - - - - function readFile(_file, format, encoding) { - file = _file; - if (!file || !file.name || !(file.path || file.stream || file.buffer)) { - throw new Error("cannot read as File: " + JSON.stringify(file).slice(0, 1000)); - } - if (0 !== self.readyState) { - console.log("already loading, request to change format ignored"); - return; - } - - // 'process.nextTick' does not ensure order, (i.e. an fs.stat queued later may return faster) - // but `onloadstart` must come before the first `data` event and must be asynchronous. 
- // Hence we waste a single tick waiting - process.nextTick(function() { - self.readyState = self.LOADING; - emitter.emit('loadstart'); - createFileStream(); - mapStreamToEmitter(format, encoding); - mapUserEvents(); - }); - } - - self.readAsArrayBuffer = function(file) { - readFile(file, 'buffer'); - }; - self.readAsBinaryString = function(file) { - readFile(file, 'binary'); - }; - self.readAsDataURL = function(file) { - readFile(file, 'dataUrl'); - }; - self.readAsText = function(file, encoding) { - readFile(file, 'text', encoding); - }; -} diff --git a/helpers/scanner/file-api.ts b/helpers/scanner/file-api.ts new file mode 100644 index 0000000..788ff1d --- /dev/null +++ b/helpers/scanner/file-api.ts @@ -0,0 +1,337 @@ +import { EventEmitter } from 'events' + +type ReadFormat = 'buffer' | 'binary' | 'dataUrl' | 'text' + +type FileInput = string | Partial +type NodeBuffer = ReturnType + +type FileReaderEventName = 'abort' | 'error' | 'load' | 'loadend' | 'loadstart' | 'progress' + +interface FileReaderProgressEvent { + lengthComputable: boolean + loaded: number + total?: number +} + +export interface FileReaderLoadEvent { + target: { + nodeBufferResult: NodeBuffer + result: NodeBuffer | string + } +} + +interface FileReaderErrorEvent { + target: { + error: Error + } +} + +type FileReaderEventPayload = FileReaderProgressEvent | FileReaderLoadEvent | FileReaderErrorEvent | undefined + +type FileReaderListener = ( event?: FileReaderEventPayload ) => void + +interface NodeFileStat { + mtime: Date + size: number +} + +type NodeFileStream = EventEmitter + +export interface NodeFile { + blob?: Blob + buffer?: NodeBuffer + jsdom?: boolean + lastModifiedDate?: Date + name: string + path?: string + size?: number + stat?: NodeFileStat + stream?: NodeFileStream + type?: string +} + +function invokeIfFunction ( listener: unknown, args: unknown[], context: unknown ) { + if ( typeof listener === 'function' ) { + listener.apply( context, args ) + } +} + +function toDataUrl ( 
data: NodeBuffer, type?: string ) { + let dataUrl = 'data:' + + if ( type ) { + dataUrl += `${ type };` + } + + if ( /text/i.test( type || '' ) ) { + dataUrl += 'charset=utf-8,' + dataUrl += data.toString( 'utf8' ) + } else { + dataUrl += 'base64,' + dataUrl += data.toString( 'base64' ) + } + + return dataUrl +} + +function mapDataToFormat ( file: NodeFile, data: NodeBuffer, format: ReadFormat, encoding?: BufferEncoding ) { + switch ( format ) { + case 'buffer': + return data + case 'binary': + return data.toString( 'binary' ) + case 'dataUrl': + return toDataUrl( data, file.type ) + case 'text': + return data.toString( encoding || 'utf8' ) + } +} + +export class File implements NodeFile { + blob?: Blob + buffer?: Buffer + jsdom?: boolean + lastModifiedDate?: Date + name: string + path?: string + size?: number + stat?: NodeFileStat + stream?: NodeFileStream + type?: string + + constructor ( input: FileInput ) { + if ( typeof input === 'string' ) { + this.path = input + } else { + Object.assign( this, input ) + } + + if ( !this.name ) { + throw new Error( 'No name' ) + } + + if ( !this.path ) { + if ( this.buffer ) { + this.size = this.buffer.length + } else if ( !this.stream ) { + throw new Error( 'No input, nor stream, nor buffer.' 
) + } + + return + } + + if ( !this.jsdom ) { + return + } + } +} + +export class FileReader { + readonly EMPTY = 0 + readonly LOADING = 1 + readonly DONE = 2 + + error?: Error + onabort?: FileReaderListener + onerror?: FileReaderListener + onload?: FileReaderListener + onloadend?: FileReaderListener + onloadstart?: FileReaderListener + onprogress?: FileReaderListener + readyState = this.EMPTY + result?: NodeBuffer | string + + private readonly emitter = new EventEmitter() + private file?: NodeFile + private fileStream?: NodeFileStream + private format?: ReadFormat + private encoding?: BufferEncoding + private readonly registeredEvents = new Set() + + nodeChunkedEncoding = false + + addEventListener ( eventName: FileReaderEventName, callback: FileReaderListener ) { + this.emitter.on( eventName, callback ) + } + + removeEventListener ( eventName: FileReaderEventName, callback: FileReaderListener ) { + this.emitter.removeListener( eventName, callback ) + } + + dispatchEvent ( eventName: FileReaderEventName, payload?: FileReaderEventPayload ) { + this.emitter.emit( eventName, payload ) + } + + on ( eventName: string | symbol, listener: ( ...args: any[] ) => void ) { + this.emitter.on( eventName, listener ) + } + + setNodeChunkedEncoding ( value: boolean ) { + this.nodeChunkedEncoding = value + } + + abort () { + if ( this.readyState === this.DONE ) { + return + } + + this.readyState = this.DONE + this.dispatchEvent( 'abort' ) + } + + readAsArrayBuffer ( file: NodeFile ) { + this.readFile( file, 'buffer' ) + } + + readAsBinaryString ( file: NodeFile ) { + this.readFile( file, 'binary' ) + } + + readAsDataURL ( file: NodeFile ) { + this.readFile( file, 'dataUrl' ) + } + + readAsText ( file: NodeFile, encoding?: BufferEncoding ) { + this.readFile( file, 'text', encoding ) + } + + private createFileStream () { + if ( this.file?.stream ) { + this.fileStream = this.file.stream + return + } + + if ( this.file?.buffer ) { + const stream = new EventEmitter() as NodeFileStream 
+ + process.nextTick( () => { + stream.emit( 'data', this.file!.buffer! ) + stream.emit( 'end' ) + } ) + + this.file!.stream = stream + this.fileStream = stream + } + } + + private registerUserEvents () { + if ( this.registeredEvents.size > 0 ) { + return + } + + const userEvents: Array<[ FileReaderEventName, 'onloadstart' | 'onprogress' | 'onload' | 'onloadend' | 'onabort' ]> = [ + [ 'loadstart', 'onloadstart' ], + [ 'progress', 'onprogress' ], + [ 'load', 'onload' ], + [ 'loadend', 'onloadend' ], + [ 'abort', 'onabort' ] + ] + + for ( const [ eventName, propertyName ] of userEvents ) { + this.emitter.on( eventName, ( event?: FileReaderEventPayload ) => { + invokeIfFunction( this[ propertyName ], [ event ], this ) + } ) + this.registeredEvents.add( eventName ) + } + + this.emitter.on( 'error', ( event?: FileReaderEventPayload ) => { + if ( typeof this.onerror === 'function' ) { + this.onerror( event ) + return + } + + const error = ( event as FileReaderErrorEvent | undefined )?.target.error + + if ( error && this.emitter.listenerCount( 'error' ) <= 1 ) { + throw error + } + } ) + this.registeredEvents.add( 'error' ) + } + + private mapStreamToEmitter () { + const stream = this.fileStream + + if ( !stream || !this.file || !this.format ) { + return + } + + const buffers: NodeBuffer[] = [] + let dataLength = 0 + + stream.on( 'error', ( error: Error ) => { + if ( this.readyState === this.DONE ) { + return + } + + this.readyState = this.DONE + this.error = error + this.dispatchEvent( 'error', { + target: { + error + } + } ) + } ) + + stream.on( 'data', ( data: NodeBuffer ) => { + if ( this.readyState === this.DONE ) { + return + } + + dataLength += data.length + buffers.push( data ) + + this.dispatchEvent( 'progress', { + lengthComputable: !Number.isNaN( this.file?.size ), + loaded: dataLength, + total: this.file?.size + } ) + } ) + + stream.on( 'end', () => { + if ( this.readyState === this.DONE ) { + return + } + + const data = buffers.length > 1 + ? 
Buffer.concat( buffers as unknown as readonly Uint8Array[] ) as NodeBuffer + : ( buffers[ 0 ] || Buffer.alloc( 0 ) ) + + this.readyState = this.DONE + this.result = mapDataToFormat( this.file!, data, this.format!, this.encoding ) + + const event = { + target: { + nodeBufferResult: data, + result: this.result + } + } + + this.dispatchEvent( 'load', event ) + this.dispatchEvent( 'loadend', event ) + } ) + } + + private readFile ( file: NodeFile, format: ReadFormat, encoding?: BufferEncoding ) { + this.file = file + this.format = format + this.encoding = encoding + + if ( !this.file || !this.file.name || !( this.file.path || this.file.stream || this.file.buffer ) ) { + throw new Error( `cannot read as File: ${ JSON.stringify( this.file ).slice( 0, 1000 ) }` ) + } + + if ( this.readyState !== this.EMPTY ) { + console.log( 'already loading, request to change format ignored' ) + return + } + + process.nextTick( () => { + this.readyState = this.LOADING + this.dispatchEvent( 'loadstart' ) + this.createFileStream() + this.mapStreamToEmitter() + this.registerUserEvents() + } ) + } +} diff --git a/helpers/scanner/parsers/plist-parser.ts b/helpers/scanner/parsers/plist-parser.ts new file mode 100644 index 0000000..5132a8a --- /dev/null +++ b/helpers/scanner/parsers/plist-parser.ts @@ -0,0 +1,384 @@ +// Adapted for browser+node from https://github.com/joeferner/node-bplist-parser/blob/master/bplistParser.js +import plainTextPlist from 'plist' +import { Buffer } from 'buffer/index.js' + +const debug = false + +export const maxObjectSize = 100 * 1000 * 1000 +export const maxObjectCount = 32768 + +const EPOCH = 978307200000 +type NodeBuffer = ReturnType + +export class UID { + UID: number + + constructor ( id: number ) { + this.UID = id + } +} + +type PlistValue = + | null + | boolean + | number + | bigint + | string + | Date + | Buffer + | UID + | PlistValue[] + | { [ key: string ]: PlistValue } + +export function parsePlistBuffer ( + fileBuffer: Uint8Array | NodeBuffer, + 
callback?: ( error: Error | null, result?: PlistValue ) => void +) { + return new Promise( ( resolve, reject ) => { + function tryParseBuffer ( buffer: Uint8Array | NodeBuffer ) { + let error: Error | null = null + let result: PlistValue | undefined + + try { + result = parseBuffer( buffer ) + resolve( result ) + } catch ( caughtError ) { + error = caughtError as Error + reject( error ) + } finally { + if ( callback ) { + callback( error, result ) + } + } + } + + tryParseBuffer( fileBuffer ) + } ) +} + +export function parseFileSync ( fileNameOrBuffer: Uint8Array | NodeBuffer ) { + return parseBuffer( fileNameOrBuffer ) +} + +function parseBuffer ( inputBuffer: Uint8Array | NodeBuffer ): PlistValue { + const buffer = Buffer.from( inputBuffer ) + const header = buffer.slice( 0, 'bplist'.length ).toString( 'utf8' ) + const isPlainTextPlist = header.includes( ' maxObjectCount ) { + throw new Error( 'maxObjectCount exceeded' ) + } + + const offsetTable: number[] = [] + + for ( let index = 0; index < numObjects; index += 1 ) { + const offsetBytes = buffer.slice( + offsetTableOffset + index * offsetSize, + offsetTableOffset + ( index + 1 ) * offsetSize + ) + + offsetTable[ index ] = readUInt( offsetBytes, 0 ) + + if ( debug ) { + console.log( `Offset for Object #${ index } is ${ offsetTable[ index ] } [${ offsetTable[ index ].toString( 16 ) }]` ) + } + } + + function parseObject ( tableOffset: number ): PlistValue { + const offset = offsetTable[ tableOffset ] + const type = buffer[ offset ] + const objType = ( type & 0xF0 ) >> 4 + const objInfo = ( type & 0x0F ) + + switch ( objType ) { + case 0x0: + return parseSimple() + case 0x1: + return parseInteger() + case 0x8: + return parseUID() + case 0x2: + return parseReal() + case 0x3: + return parseDate() + case 0x4: + return parseData() + case 0x5: + return parsePlistString() + case 0x6: + return parsePlistString( true ) + case 0xA: + return parseArray() + case 0xD: + return parseDictionary() + default: + throw new Error( 
`Unhandled type 0x${ objType.toString( 16 ) }` ) + } + + function parseSimple (): PlistValue { + switch ( objInfo ) { + case 0x0: + return null + case 0x8: + return false + case 0x9: + return true + case 0xF: + return null + default: + throw new Error( `Unhandled simple type 0x${ objType.toString( 16 ) }` ) + } + } + + function bufferToHexString ( inputBuffer: NodeBuffer ) { + let result = '' + let index = 0 + + for ( ; index < inputBuffer.length; index += 1 ) { + if ( inputBuffer[ index ] !== 0x00 ) { + break + } + } + + for ( ; index < inputBuffer.length; index += 1 ) { + const part = `00${ inputBuffer[ index ].toString( 16 ) }` + result += part.slice( part.length - 2 ) + } + + return result + } + + function parseInteger (): PlistValue { + const length = Math.pow( 2, objInfo ) + + if ( length >= maxObjectSize ) { + throw new Error( `Too little heap space available! Wanted to read ${ length } bytes, but only ${ maxObjectSize } are available.` ) + } + + const data = buffer.slice( offset + 1, offset + 1 + length ) + + if ( length === 16 ) { + const hex = bufferToHexString( data ) + return BigInt( `0x${ hex }` ) + } + + return data.reduce( ( accumulator, currentValue ) => { + accumulator <<= 8 + accumulator |= currentValue & 255 + return accumulator + }, 0 ) + } + + function parseUID (): PlistValue { + const length = objInfo + 1 + + if ( length >= maxObjectSize ) { + throw new Error( `Too little heap space available! Wanted to read ${ length } bytes, but only ${ maxObjectSize } are available.` ) + } + + return new UID( readUInt( buffer.slice( offset + 1, offset + 1 + length ) ) ) + } + + function parseReal (): PlistValue { + const length = Math.pow( 2, objInfo ) + + if ( length >= maxObjectSize ) { + throw new Error( `Too little heap space available! 
Wanted to read ${ length } bytes, but only ${ maxObjectSize } are available.` ) + } + + const realBuffer = buffer.slice( offset + 1, offset + 1 + length ) + + if ( length === 4 ) { + return realBuffer.readFloatBE( 0 ) + } + + if ( length === 8 ) { + return realBuffer.readDoubleBE( 0 ) + } + + throw new Error( `Unsupported real length ${ length }` ) + } + + function parseDate (): PlistValue { + if ( objInfo !== 0x3 ) { + console.error( `Unknown date type :${ objInfo }. Parsing anyway...` ) + } + + const dateBuffer = buffer.slice( offset + 1, offset + 9 ) + return new Date( EPOCH + ( 1000 * dateBuffer.readDoubleBE( 0 ) ) ) + } + + function readLength ( kind: string ) { + let dataOffset = 1 + let length = objInfo + + if ( objInfo === 0xF ) { + const intTypeByte = buffer[ offset + 1 ] + const intType = ( intTypeByte & 0xF0 ) / 0x10 + + if ( intType !== 0x1 ) { + console.error( `${ kind }: UNEXPECTED LENGTH-INT TYPE! ${ intType }` ) + } + + const intInfo = intTypeByte & 0x0F + const intLength = Math.pow( 2, intInfo ) + dataOffset = 2 + intLength + length = readUInt( buffer.slice( offset + 2, offset + 2 + intLength ) ) + } + + return { + dataOffset, + length + } + } + + function parseData (): PlistValue { + const { dataOffset, length } = readLength( '0x4' ) + + if ( length >= maxObjectSize ) { + throw new Error( `Too little heap space available! Wanted to read ${ length } bytes, but only ${ maxObjectSize } are available.` ) + } + + return buffer.slice( offset + dataOffset, offset + dataOffset + length ) + } + + function parsePlistString ( isUtf16 = false ): PlistValue { + let encoding: BufferEncoding = 'utf8' + const { dataOffset, length: rawLength } = readLength( 'string' ) + const length = rawLength * ( isUtf16 ? 2 : 1 ) + + if ( length >= maxObjectSize ) { + throw new Error( `Too little heap space available! 
Wanted to read ${ length } bytes, but only ${ maxObjectSize } are available.` ) + } + + let plistString = Buffer.from( buffer.slice( offset + dataOffset, offset + dataOffset + length ) ) + + if ( isUtf16 ) { + plistString = swapBytes( plistString ) + encoding = 'ucs2' + } + + return plistString.toString( encoding ) + } + + function parseArray (): PlistValue { + const { dataOffset, length } = readLength( '0xa' ) + + if ( length * objectRefSize > maxObjectSize ) { + throw new Error( 'Too little heap space available!' ) + } + + const array: PlistValue[] = [] + + for ( let index = 0; index < length; index += 1 ) { + const objectRef = readUInt( + buffer.slice( + offset + dataOffset + index * objectRefSize, + offset + dataOffset + ( index + 1 ) * objectRefSize + ) + ) + + array[ index ] = parseObject( objectRef ) + } + + return array + } + + function parseDictionary (): PlistValue { + const { dataOffset, length } = readLength( '0xd' ) + + if ( length * 2 * objectRefSize > maxObjectSize ) { + throw new Error( 'Too little heap space available!' 
) + } + + const dictionary: Record = {} + + for ( let index = 0; index < length; index += 1 ) { + const keyRef = readUInt( + buffer.slice( + offset + dataOffset + index * objectRefSize, + offset + dataOffset + ( index + 1 ) * objectRefSize + ) + ) + const valueRef = readUInt( + buffer.slice( + offset + dataOffset + length * objectRefSize + index * objectRefSize, + offset + dataOffset + length * objectRefSize + ( index + 1 ) * objectRefSize + ) + ) + const key = parseObject( keyRef ) + + if ( typeof key !== 'string' ) { + throw new Error( 'Dictionary key is not a string' ) + } + + dictionary[ key ] = parseObject( valueRef ) + } + + return dictionary + } + } + + return parseObject( topObject ) +} + +function readUInt64BE ( buffer: NodeBuffer, offset: number ) { + const data = buffer.slice( offset, offset + 8 ) + + return data.reduce( ( accumulator, currentValue ) => { + accumulator <<= 8 + accumulator |= currentValue & 0xff + return accumulator + }, 0 ) +} + +function readUInt ( buffer: NodeBuffer, start = 0 ) { + return buffer.slice( start ).reduce( ( accumulator, currentValue ) => { + accumulator <<= 8 + accumulator |= currentValue & 0xff + return accumulator + }, 0 ) +} + +function swapBytes ( buffer: NodeBuffer ) { + const length = buffer.length + + if ( length % 2 !== 0 ) { + throw new Error( 'Buffer length must be even' ) + } + + for ( let index = 0; index < length; index += 2 ) { + const currentValue = buffer[ index ] + buffer[ index ] = buffer[ index + 1 ] + buffer[ index + 1 ] = currentValue + } + + return buffer +} diff --git a/helpers/scanner/parsers/plist.js b/helpers/scanner/parsers/plist.js deleted file mode 100644 index d030d07..0000000 --- a/helpers/scanner/parsers/plist.js +++ /dev/null @@ -1,385 +0,0 @@ -// Adpapted for browser+node from https://github.com/joeferner/node-bplist-parser/blob/master/bplistParser.js -import plainTextPlist from 'plist' -import { Buffer } from 'buffer/index.js' - - -// const fs = require('fs'); -// const bufferApi = 
require('buffer') -const debug = false - -export const maxObjectSize = 100 * 1000 * 1000; // 100Meg -export const maxObjectCount = 32768; - -// EPOCH = new SimpleDateFormat("yyyy MM dd zzz").parse("2001 01 01 GMT").getTime(); -// ...but that's annoying in a static initializer because it can throw exceptions, ick. -// So we just hardcode the correct value. -const EPOCH = 978307200000; - -// UID object definition -const UID = function(id) { - this.UID = id; -}; - -export function parsePlistBuffer ( fileBuffer , callback) { - return new Promise(function(resolve, reject) { - function tryParseBuffer(buffer) { - let err = null; - let result; - try { - result = parseBuffer(buffer); - resolve(result); - } catch (ex) { - err = ex; - reject(err); - } finally { - if (callback) callback(err, result); - } - } - - return tryParseBuffer( fileBuffer ) - - // if (Buffer.isBuffer(fileNameOrBuffer)) { - - // } - // fs.readFile(fileNameOrBuffer, function(err, data) { - // if (err) { - // reject(err); - // return callback(err); - // } - // tryParseBuffer(data); - // }); - }); -}; - -export function parseFileSync (fileNameOrBuffer) { - // if (!Buffer.isBuffer(fileNameOrBuffer)) { - // fileNameOrBuffer = fs.readFileSync(fileNameOrBuffer); - // } - return parseBuffer(fileNameOrBuffer); -}; - -function parseBuffer ( buffer ) { - // check header - const header = buffer.slice(0, 'bplist'.length).toString('utf8'); - - - - const isPlainTextPlist = header.includes(' maxObjectCount) { - throw new Error("maxObjectCount exceeded"); - } - - // Handle offset table - const offsetTable = []; - - for (let i = 0; i < numObjects; i++) { - const offsetBytes = buffer.slice(offsetTableOffset + i * offsetSize, offsetTableOffset + (i + 1) * offsetSize); - offsetTable[i] = readUInt(offsetBytes, 0); - if (debug) { - console.log("Offset for Object #" + i + " is " + offsetTable[i] + " [" + offsetTable[i].toString(16) + "]"); - } - } - - // Parses an object inside the currently parsed binary property list. 
- // For the format specification check - // - // Apple's binary property list parser implementation. - function parseObject(tableOffset) { - const offset = offsetTable[tableOffset]; - const type = buffer[offset]; - const objType = (type & 0xF0) >> 4; //First 4 bits - const objInfo = (type & 0x0F); //Second 4 bits - switch (objType) { - case 0x0: - return parseSimple(); - case 0x1: - return parseInteger(); - case 0x8: - return parseUID(); - case 0x2: - return parseReal(); - case 0x3: - return parseDate(); - case 0x4: - return parseData(); - case 0x5: // ASCII - return parsePlistString(); - case 0x6: // UTF-16 - return parsePlistString(true); - case 0xA: - return parseArray(); - case 0xD: - return parseDictionary(); - default: - throw new Error("Unhandled type 0x" + objType.toString(16)); - } - - function parseSimple() { - //Simple - switch (objInfo) { - case 0x0: // null - return null; - case 0x8: // false - return false; - case 0x9: // true - return true; - case 0xF: // filler byte - return null; - default: - throw new Error("Unhandled simple type 0x" + objType.toString(16)); - } - } - - function bufferToHexString(buffer) { - let str = ''; - let i; - for (i = 0; i < buffer.length; i++) { - if (buffer[i] != 0x00) { - break; - } - } - for (; i < buffer.length; i++) { - const part = '00' + buffer[i].toString(16); - str += part.substr(part.length - 2); - } - return str; - } - - function parseInteger() { - const length = Math.pow(2, objInfo); - if (length < maxObjectSize) { - const data = buffer.slice(offset + 1, offset + 1 + length); - if (length === 16) { - const str = bufferToHexString(data); - return BigInt(str, 16); - } - return data.reduce((acc, curr) => { - acc <<= 8; - acc |= curr & 255; - return acc; - }); - } - throw new Error("Too little heap space available! 
Wanted to read " + length + " bytes, but only " + maxObjectSize + " are available."); - - } - - function parseUID() { - const length = objInfo + 1; - if (length < maxObjectSize) { - return new UID(readUInt(buffer.slice(offset + 1, offset + 1 + length))); - } - throw new Error("Too little heap space available! Wanted to read " + length + " bytes, but only " + maxObjectSize + " are available."); - } - - function parseReal() { - const length = Math.pow(2, objInfo); - if (length < maxObjectSize) { - const realBuffer = buffer.slice(offset + 1, offset + 1 + length); - if (length === 4) { - return realBuffer.readFloatBE(0); - } - if (length === 8) { - return realBuffer.readDoubleBE(0); - } - } else { - throw new Error("Too little heap space available! Wanted to read " + length + " bytes, but only " + maxObjectSize + " are available."); - } - } - - function parseDate() { - if (objInfo != 0x3) { - console.error("Unknown date type :" + objInfo + ". Parsing anyway..."); - } - const dateBuffer = buffer.slice(offset + 1, offset + 9); - return new Date(EPOCH + (1000 * dateBuffer.readDoubleBE(0))); - } - - function parseData() { - let dataoffset = 1; - let length = objInfo; - if (objInfo == 0xF) { - const int_type = buffer[offset + 1]; - const intType = (int_type & 0xF0) / 0x10; - if (intType != 0x1) { - console.error("0x4: UNEXPECTED LENGTH-INT TYPE! " + intType); - } - const intInfo = int_type & 0x0F; - const intLength = Math.pow(2, intInfo); - dataoffset = 2 + intLength; - if (intLength < 3) { - length = readUInt(buffer.slice(offset + 2, offset + 2 + intLength)); - } else { - length = readUInt(buffer.slice(offset + 2, offset + 2 + intLength)); - } - } - if (length < maxObjectSize) { - return buffer.slice(offset + dataoffset, offset + dataoffset + length); - } - throw new Error("Too little heap space available! 
Wanted to read " + length + " bytes, but only " + maxObjectSize + " are available."); - } - - function parsePlistString(isUtf16) { - isUtf16 = isUtf16 || 0; - let enc = "utf8"; - let length = objInfo; - let stroffset = 1; - if (objInfo == 0xF) { - const int_type = buffer[offset + 1]; - const intType = (int_type & 0xF0) / 0x10; - if (intType != 0x1) { - console.error("UNEXPECTED LENGTH-INT TYPE! " + intType); - } - const intInfo = int_type & 0x0F; - const intLength = Math.pow(2, intInfo); - stroffset = 2 + intLength; - if (intLength < 3) { - length = readUInt(buffer.slice(offset + 2, offset + 2 + intLength)); - } else { - length = readUInt(buffer.slice(offset + 2, offset + 2 + intLength)); - } - } - // length is String length -> to get byte length multiply by 2, as 1 character takes 2 bytes in UTF-16 - length *= (isUtf16 + 1); - if (length < maxObjectSize) { - let plistString = Buffer.from(buffer.slice(offset + stroffset, offset + stroffset + length)); - if (isUtf16) { - plistString = swapBytes(plistString); - enc = "ucs2"; - } - return plistString.toString(enc); - } - throw new Error("Too little heap space available! Wanted to read " + length + " bytes, but only " + maxObjectSize + " are available."); - } - - function parseArray() { - let length = objInfo; - let arrayoffset = 1; - if (objInfo == 0xF) { - const int_type = buffer[offset + 1]; - const intType = (int_type & 0xF0) / 0x10; - if (intType != 0x1) { - console.error("0xa: UNEXPECTED LENGTH-INT TYPE! 
" + intType); - } - const intInfo = int_type & 0x0F; - const intLength = Math.pow(2, intInfo); - arrayoffset = 2 + intLength; - if (intLength < 3) { - length = readUInt(buffer.slice(offset + 2, offset + 2 + intLength)); - } else { - length = readUInt(buffer.slice(offset + 2, offset + 2 + intLength)); - } - } - if (length * objectRefSize > maxObjectSize) { - throw new Error("Too little heap space available!"); - } - const array = []; - for (let i = 0; i < length; i++) { - const objRef = readUInt(buffer.slice(offset + arrayoffset + i * objectRefSize, offset + arrayoffset + (i + 1) * objectRefSize)); - array[i] = parseObject(objRef); - } - return array; - } - - function parseDictionary() { - let length = objInfo; - let dictoffset = 1; - if (objInfo == 0xF) { - const int_type = buffer[offset + 1]; - const intType = (int_type & 0xF0) / 0x10; - if (intType != 0x1) { - console.error("0xD: UNEXPECTED LENGTH-INT TYPE! " + intType); - } - const intInfo = int_type & 0x0F; - const intLength = Math.pow(2, intInfo); - dictoffset = 2 + intLength; - if (intLength < 3) { - length = readUInt(buffer.slice(offset + 2, offset + 2 + intLength)); - } else { - length = readUInt(buffer.slice(offset + 2, offset + 2 + intLength)); - } - } - if (length * 2 * objectRefSize > maxObjectSize) { - throw new Error("Too little heap space available!"); - } - if (debug) { - console.log("Parsing dictionary #" + tableOffset); - } - const dict = {}; - for (let i = 0; i < length; i++) { - const keyRef = readUInt(buffer.slice(offset + dictoffset + i * objectRefSize, offset + dictoffset + (i + 1) * objectRefSize)); - const valRef = readUInt(buffer.slice(offset + dictoffset + (length * objectRefSize) + i * objectRefSize, offset + dictoffset + (length * objectRefSize) + (i + 1) * objectRefSize)); - const key = parseObject(keyRef); - const val = parseObject(valRef); - if (debug) { - console.log(" DICT #" + tableOffset + ": Mapped " + key + " to " + val); - } - dict[key] = val; - } - return dict; - } - } - - 
return parseObject(topObject) -}; - -function readUInt(buffer, start) { - start = start || 0; - - let l = 0; - for (let i = start; i < buffer.length; i++) { - l <<= 8; - l |= buffer[i] & 0xFF; - } - return l; -} - -// we're just going to toss the high order bits because javascript doesn't have 64-bit ints -function readUInt64BE(buffer, start) { - const data = buffer.slice(start, start + 8); - return data.readUInt32BE(4, 8); -} - -function swapBytes(buffer) { - const len = buffer.length; - for (let i = 0; i < len; i += 2) { - const a = buffer[i]; - buffer[i] = buffer[i + 1]; - buffer[i + 1] = a; - } - return buffer; -} diff --git a/helpers/scanner/scan.mjs b/helpers/scanner/scan.mjs deleted file mode 100644 index 7b23ce9..0000000 --- a/helpers/scanner/scan.mjs +++ /dev/null @@ -1,475 +0,0 @@ -import { Buffer } from 'buffer/index.js' -import prettyBytes from 'pretty-bytes' -import * as zip from '@zip.js/zip.js' - -import * as FileApi from './file-api.js' -import { isString, isNonEmptyString } from '../check-types.js' -import { parsePlistBuffer } from './parsers/plist.js' -import { extractMachoMeta } from './parsers/macho.js' - -// https://gildas-lormeau.github.io/zip.js/core-api.html#configuration -zip.configure({ - // Disable Web Workers for SSR since Node doesn't support them yet - // https://vitejs.dev/guide/env-and-mode.html#env-variables - useWebWorkers: !import.meta.env.SSR -}) - - -function makeNodeFileBuffer ( buffer ) { - const fileBuffer = new Buffer.alloc( buffer.byteLength ) - - for (var i = 0; i < buffer.length; i++) - fileBuffer[i] = buffer[i]; - - // console.log( 'this.machoFileInstance', this.machoFileInstance.buffer.byteLength ) - - return fileBuffer -} - -export class AppScan { - constructor ({ - fileLoader, - messageReceiver - }) { - - this.fileLoader = fileLoader - this.messageReceiver = messageReceiver - - this.status = 'idle' - this.file = null - this.bundleFileEntries = [] - this.infoPlist = {} - this.machoExcutables = [] - - // Data that is 
derived after we've read the files and pulled out the infoPlist - this.appVersion = '' - this.displayName = '' - this.details = [] - this.bundleExecutable = null - this.displayBinarySize = '' - this.binarySize = 0 - this.machoMeta = {} - this.binarySupportsNative = undefined - - this.info = {} - } - - sendMessage ( details ) { - if ( details?.status ) { - this.status = details.status - } - - if ( typeof( this.messageReceiver ) === 'function' ) { - this.messageReceiver( details ) - } - } - - get hasInfoPlist () { - return Object.keys( this.infoPlist ).length > 0 - } - - get hasMachoMeta () { - return Object.keys( this.machoMeta ).length > 0 - } - - get hasInfo () { - return Object.keys( this.info ).length > 0 - } - - get bundleExecutablePath () { - if ( !this.hasInfoPlist ) return '' - - // There our CFBundleExecutable is a path to the executable - // then use it - if ( this.infoPlist.CFBundleExecutable.includes('/') ) return `/Contents/${ this.infoPlist.CFBundleExecutable }` - - // Use default executable path - return `/Contents/MacOS/${ this.infoPlist.CFBundleExecutable }` - } - - get supportedArchitectures () { - if ( !this.hasMachoMeta ) return [] - - return this.machoMeta.architectures.filter( architecture => architecture.processorType !== 0 ) - } - - async readFileEntryData ( fileEntry, Writer = zip.TextWriter ) { - // Get blob data from zip - // https://gildas-lormeau.github.io/zip.js/core-api.html#zip-entry - return await fileEntry.getData( - // writer - // https://gildas-lormeau.github.io/zip.js/core-api.html#zip-writing - new Writer()//zip.TextWriter(), - ) - } - - async loadFile () { - // If fileLoader is no a function - // then try to load the file - if ( typeof( this.fileLoader ) !== 'function' ) { - return this.fileLoader - } - - const file = this.fileLoader() - - // Check if our file is a Promise - // if it is then await it - if ( file instanceof Promise || typeof file?.then === 'function' ) { - return await file - } - - return file - } - - 
getZipFileReader ( FileInstance ) { - // Check if file is a Blob, typically in the Browser - // otherwise convert it to a Blob, like in Node - // Both Browser and Node have Blob - // Node/Our File Polyfill references .arrayBuffer as a property - // Browser currently references .arrayBuffer as an async method - if ( FileInstance instanceof Blob ) { - return new zip.BlobReader( FileInstance ) - } - - if ( FileInstance instanceof ArrayBuffer ) { - return new zip.Uint8ArrayReader( FileInstance ) - } - - // return new zip.Uint8ArrayReader( new Uint8Array( FileInstance.arrayBuffer ) ) - // const FileBlob = FileInstance instanceof Blob ? FileInstance : new Blob( FileInstance.arrayBuffer ) - - throw new Error( 'FileInstance is not a known format' ) - } - - async readFileBlob ( FileInstance ) { - return new Promise( async ( resolve, reject ) => { - - console.log( 'FileInstance', FileInstance ) - - const binaryReader = this.getZipFileReader( FileInstance ) //new zip.BlobReader( FileBlob ) - - // https://gildas-lormeau.github.io/zip.js/core-api.html#zip-reading - const zipReader = new zip.ZipReader( binaryReader ) - - zipReader - .getEntries() - .then( entries => { - - // do something on entries - this.sendMessage({ - message: '📖 Reading file complete. 
Entries found', - status: 'read' - }) - - resolve( entries ) - }) - .catch( error => { - reject( error ) - }) - - }) - } - - classifyBinaryEntryArchitecture ( binaryEntry ) { - // Find an ARM Architecture - const armArchitecture = binaryEntry.architectures.find( architecture => { - // if ( architecture.processorType === 0 ) return false - - // If processorType not a string - // then return false - if ( !isString(architecture.processorType) ) return false - - return architecture.processorType.toLowerCase().includes('arm') - }) - - // Was an ARM Architecture found - return (armArchitecture !== undefined) - } - - matchesMachoExecutable ( entry ) { - // Skip files that are deeper than 3 folders - if ( entry.filename.split('/').length > 4 ) return false - - // Skip folders - // if ( !!entry.directory ) return false - - // `${ appName }.app/Contents/MacOS/${ appName }` - // Does this entry path match any of our wanted paths - return [ - // `${ appName }.app/Contents/MacOS/${ appName }` - // `.app/Contents/MacOS/`, - `Contents/MacOS/` - ].some( pathToMatch => { - return entry.filename.includes( pathToMatch ) - }) - } - - matchesRootInfoPlist ( entry ) { - // Skip files that are deeper than 2 folders - if ( entry.filename.split('/').length > 3 ) return false - - // Skip folders - if ( entry.filename.endsWith('/') ) return false - - // If this entry matches the root info.plist path exactly - // then we have found the root info.plist - if ( entry.filename === 'Contents/Info.plist' ) return true - - // Does this entry path match any of our wanted paths - return [ - // `zoom.us.app/Contents/Info.plist` - `.app/Contents/Info.plist`, - `.zip/Contents/Info.plist` - ].some( pathToMatch => { - return entry.filename.endsWith( pathToMatch ) - }) - } - - fileEntryType ( fileEntry ) { - if ( !!fileEntry.directory ) return 'directory' - - if ( this.matchesMachoExecutable( fileEntry ) ) return 'machoExecutable' - - if ( this.matchesRootInfoPlist( fileEntry ) ) return 'rootInfoPlist' - - 
// getData - - return 'unknown' - } - - storeInfoPlist = async ( fileEntry ) => { - // Throw if we have more than one target file - if ( this.hasInfoPlist ) { - throw new Error( 'More than one root info.plist found' ) - } - - const infoUint8Array = await this.readFileEntryData( fileEntry, zip.Uint8ArrayWriter ) - // console.log( 'infoUint8Array', infoUint8Array ) - - const infoNodeBuffer = makeNodeFileBuffer( infoUint8Array ) - - // Parse the Info.plist data - this.infoPlist = await parsePlistBuffer( infoNodeBuffer ) - - this.sendMessage({ - message: 'ℹ️ Found Info.plist', - // data: this.infoPlist - }) - } - - storeMachoExecutable = ( fileEntry ) => { - this.machoExcutables.push( fileEntry ) - - this.sendMessage({ - message: '🥊 Found a Macho executable', - // data: machoExecutable, - }) - } - - storeResultInfo () { - this.info = { - filename: this.file.name, - appVersion: this.appVersion, - result: this.binarySupportsNative ? '✅' : '🔶', - machoMeta: { - ...this.machoMeta, - file: undefined, - architectures: this.machoMeta.architectures.map( architecture => { - return { - bits: architecture.bits, - fileType: architecture.fileType, - header: architecture.header, - loadCommandsInfo: architecture.loadCommandsInfo, - magic: architecture.magic, - offset: architecture.offset, - processorSubType: architecture.processorSubType, - processorType: architecture.processorType, - } - }) - }, - infoPlist: this.infoPlist, - } - } - - storeMachoMeta = async ( fileEntry ) => { - // Throw if we have more than one target file - if ( this.hasMachoMeta ) { - throw new Error( 'More than one primary Macho executable found' ) - } - - // Get zip as Uint8Array - const bundleExecutableUint8Array = await this.readFileEntryData( fileEntry, zip.Uint8ArrayWriter ) - - const machoFileInstance = new FileApi.File({ - name: this.bundleExecutable.filename, - type: 'application/x-mach-binary', - buffer: Buffer.from( bundleExecutableUint8Array ) - }) - - // Get zip as blob - // so we can use it in for 
the File API when we're in the browser context - // https://gildas-lormeau.github.io/zip.js/core-api.html#zip-entry - machoFileInstance.blob = await this.readFileEntryData( fileEntry, zip.BlobWriter ) - - this.machoMeta = await extractMachoMeta({ - machoFileInstance, - FileApi - }) - - // console.log( 'this.machoMeta', this.machoMeta ) - - } - - - targetFiles = { - rootInfoPlist: { - method: this.storeInfoPlist - }, - machoExecutable: { - method: this.storeMachoExecutable, - } - } - - findMainExecutable () { - - // Now that we have the info.plist Determine our entry Macho Executable from the list of Macho Executables - const bundleExecutables = this.machoExcutables.filter( machoEntry => { - - if ( machoEntry.filename.includes( this.bundleExecutablePath ) ) { - return true - } - - return this.bundleExecutablePath.includes( machoEntry.filename ) - }) - - // Warn if Bundle Executable doesn't look right - if ( bundleExecutables.length > 1) { - throw new Error('More than one root bundleExecutable found', bundleExecutables) - } else if ( bundleExecutables.length === 0 ) { - throw new Error('No root bundleExecutable found', bundleExecutables) - } - - return bundleExecutables[ 0 ] - } - - async findTargetFiles () { - - for ( const fileEntry of this.bundleFileEntries ) { - const type = this.fileEntryType( fileEntry ) - - // console.log( 'fileEntry', type, fileEntry.filename ) - - // Check if we have a target file - if ( this.targetFiles[ type ] ) { - // console.log( 'fileEntry', type, fileEntry.filename ) - - // Call the target file method - await this.targetFiles[ type ].method( fileEntry ) - } - - // console.log( 'File Entry Type:', type ) - } - - // Now that we have the info.plist Determine our entry Macho Executable from the list of Macho Executables - - // Find valid app version that is a string but not empty - this.appVersion = ([ - this.infoPlist.CFBundleShortVersionString, - this.infoPlist.CFBundleVersion - ]).find( isNonEmptyString )[0] - - // Find Display Name 
that is a string but not empty - this.displayName = ([ - this.infoPlist.CFBundleDisplayName, - this.infoPlist.CFBundleName, - this.infoPlist.CFBundleExecutable, - ]).find( isNonEmptyString )[0] - - // We loop through possible details and add them to the details array - // if they are not empty - ;([ - [ 'Version', this.infoPlist.CFBundleShortVersionString ], - [ 'Bundle Identifier', this.infoPlist.CFBundleIdentifier ], - [ 'File Mime Type', this.file.type ], - [ 'Copyright', this.infoPlist.NSHumanReadableCopyright ], - // [ 'Version', info.CFBundleShortVersionString ], - ]).forEach( ([ label, value ]) => { - if ( !value || value.length === 0 ) return - - this.details.push({ - label, - value, - }) - } ) - - // Set the bundleExecutable - this.bundleExecutable = this.findMainExecutable() - - console.log('Parsing ', this.bundleExecutable.filename, this.bundleExecutable.uncompressedSize / 1000 ) - - this.displayBinarySize = prettyBytes( this.bundleExecutable.uncompressedSize ) - this.binarySize = this.bundleExecutable.uncompressedSize - - - await this.storeMachoMeta( this.bundleExecutable ) - - this.binarySupportsNative = this.classifyBinaryEntryArchitecture( this.machoMeta ) - } - - async runScan () { - // Load in the file - this.sendMessage({ - message: '🚛 Loading file...', - status: 'loading' - }) - - - this.file = await this.loadFile() - - this.sendMessage({ - message: '📚 Extracting from archive...', - status: 'scanning', - data: this.file - }) - - this.bundleFileEntries = await this.readFileBlob( this.file ) - - this.sendMessage({ - message: '🎬 Starting scan', - status: 'scanning' - }) - - await this.findTargetFiles() - - this.storeResultInfo() - - this.sendMessage({ - message: '🔎 Checking online for native versions...', - status: 'checking' - }) - - // Sleep for 3 seconds - // await new Promise( resolve => setTimeout( resolve, 3000 ) ) - - this.sendMessage({ - message: '🏁 Scan complete! 
', - status: 'finished' - }) - } - - async start () { - - try { - - await this.runScan() - - } catch ( error ) { - this.sendMessage({ - message: '🚫 Error: ' + error.message, - status: 'finished', - error - }) - } - - } -} diff --git a/helpers/scanner/scan.ts b/helpers/scanner/scan.ts new file mode 100644 index 0000000..f718bad --- /dev/null +++ b/helpers/scanner/scan.ts @@ -0,0 +1,539 @@ +import { Buffer } from 'buffer/index.js' +import prettyBytes from 'pretty-bytes' +import * as zip from '@zip.js/zip.js' + +import * as FileApi from './file-api' +import type { NodeFile } from './file-api' +import { isNonEmptyString, isString } from '../check-types.js' +import { extractMachoMeta } from './parsers/macho.js' +import { parsePlistBuffer } from './parsers/plist-parser' + +zip.configure({ + useWebWorkers: !import.meta.env.SSR +}) + +type MaybePromise = Promise | T + +type ScanStatus = 'idle' | 'loading' | 'read' | 'scanning' | 'checking' | 'finished' + +type FileArrayBuffer = ArrayBuffer + +export interface ScanFileLike { + arrayBuffer?: FileArrayBuffer | (() => Promise) + blob?: Blob + buffer?: ArrayBuffer | ArrayBufferView + mimeType?: string + name: string + size?: number + type?: string +} + +export interface ScanDetail { + label: string + value: string +} + +export interface ScanArchitecture { + bits?: unknown + fileType?: unknown + header?: unknown + loadCommandsInfo?: unknown + magic?: unknown + offset?: unknown + processorSubType?: unknown + processorType?: unknown +} + +export interface ScanMachoMeta { + architectures: ScanArchitecture[] + [ key: string ]: unknown +} + +export interface ScanInfo { + appVersion: string + filename: string + infoPlist: Record + machoMeta: ScanMachoMeta | null + result: '✅' | '🔶' +} + +export interface AppScanSnapshot { + appVersion: string + binarySize: number + binarySupportsNative: boolean + details: ScanDetail[] + displayBinarySize: string + displayName: string + hasInfo: boolean + hasInfoPlist: boolean + hasMachoMeta: boolean + 
info: ScanInfo + infoPlist: Record + machoMeta: ScanMachoMeta + status: ScanStatus + supportedArchitectures: ScanArchitecture[] +} + +export interface ScanMessage { + data?: unknown + error?: unknown + message?: string + status: ScanStatus +} + +interface ScanFileEntry { + directory?: boolean + filename: string + getData: ( writer: unknown ) => Promise + uncompressedSize: number +} + +interface ScanMachoFileInstance { + blob?: Blob + buffer: NodeFile['buffer'] + name: string + type: string +} + +interface AppScanOptions { + fileLoader: (() => MaybePromise) | ArrayBuffer | Blob | ScanFileLike + messageReceiver?: ( details: ScanMessage ) => void +} + +function makeNodeFileBuffer ( buffer: Uint8Array ) { + const fileBuffer = Buffer.alloc( buffer.byteLength ) + + for ( let index = 0; index < buffer.length; index += 1 ) { + fileBuffer[ index ] = buffer[ index ] + } + + return fileBuffer +} + +function toArrayBuffer ( value: ArrayBuffer | ArrayBufferView ) { + if ( value instanceof ArrayBuffer ) { + return value + } + + return value.buffer.slice( + value.byteOffset, + value.byteOffset + value.byteLength + ) +} + +function isBlob ( value: unknown ): value is Blob { + return typeof Blob === 'function' && value instanceof Blob +} + +function firstNonEmptyString ( values: unknown[] ) { + const match = values.find( value => isNonEmptyString( value ) ) + + return typeof match === 'string' ? 
match : '' +} + +function isPromiseLike ( value: unknown ): value is PromiseLike { + return Boolean( value ) && typeof ( value as PromiseLike ).then === 'function' +} + +export class AppScan { + fileLoader: AppScanOptions['fileLoader'] + messageReceiver?: ( details: ScanMessage ) => void + status: ScanStatus + file: ArrayBuffer | Blob | ScanFileLike | null + bundleFileEntries: ScanFileEntry[] + infoPlist: Record + machoExcutables: ScanFileEntry[] + appVersion: string + displayName: string + details: ScanDetail[] + bundleExecutable: ScanFileEntry | null + displayBinarySize: string + binarySize: number + machoMeta: ScanMachoMeta + binarySupportsNative: boolean + info: ScanInfo + + constructor ( { + fileLoader, + messageReceiver + }: AppScanOptions ) { + this.fileLoader = fileLoader + this.messageReceiver = messageReceiver + + this.status = 'idle' + this.file = null + this.bundleFileEntries = [] + this.infoPlist = {} + this.machoExcutables = [] + + this.appVersion = '' + this.displayName = '' + this.details = [] + this.bundleExecutable = null + this.displayBinarySize = '' + this.binarySize = 0 + this.machoMeta = { + architectures: [] + } + this.binarySupportsNative = false + + this.info = { + appVersion: '', + filename: '', + infoPlist: {}, + machoMeta: null, + result: '🔶' + } + } + + sendMessage ( details: ScanMessage ) { + if ( details.status ) { + this.status = details.status + } + + if ( typeof this.messageReceiver === 'function' ) { + this.messageReceiver( details ) + } + } + + get hasInfoPlist () { + return Object.keys( this.infoPlist ).length > 0 + } + + get hasMachoMeta () { + return this.machoMeta.architectures.length > 0 + } + + get hasInfo () { + return this.info.filename.length > 0 + } + + get bundleExecutablePath () { + const bundleExecutable = this.infoPlist.CFBundleExecutable + + if ( !isNonEmptyString( bundleExecutable ) ) return '' + + const executablePath = String( bundleExecutable ) + + if ( executablePath.includes( '/' ) ) return `/Contents/${ 
executablePath }` + + return `/Contents/MacOS/${ executablePath }` + } + + get supportedArchitectures () { + return this.machoMeta.architectures.filter( architecture => architecture.processorType !== 0 ) + } + + async readFileEntryData ( fileEntry: ScanFileEntry, Writer: new () => T = zip.TextWriter as new () => T ) { + return await fileEntry.getData( + new Writer() + ) + } + + async loadFile (): Promise { + if ( typeof this.fileLoader !== 'function' ) { + return this.fileLoader + } + + const file = this.fileLoader() + + if ( file instanceof Promise || isPromiseLike( file ) ) { + return await file as ArrayBuffer | Blob | ScanFileLike + } + + return file + } + + async getZipFileReader ( fileInstance: ArrayBuffer | Blob | ScanFileLike ) { + if ( isBlob( fileInstance ) ) { + return new zip.BlobReader( fileInstance ) + } + + if ( fileInstance instanceof ArrayBuffer ) { + return new zip.Uint8ArrayReader( new Uint8Array( fileInstance ) ) + } + + if ( isBlob( fileInstance.blob ) ) { + return new zip.BlobReader( fileInstance.blob ) + } + + if ( typeof fileInstance.arrayBuffer === 'function' ) { + return new zip.Uint8ArrayReader( new Uint8Array( await fileInstance.arrayBuffer() ) ) + } + + if ( fileInstance.arrayBuffer instanceof ArrayBuffer ) { + return new zip.Uint8ArrayReader( new Uint8Array( fileInstance.arrayBuffer ) ) + } + + if ( fileInstance.buffer instanceof ArrayBuffer ) { + return new zip.Uint8ArrayReader( new Uint8Array( fileInstance.buffer ) ) + } + + if ( ArrayBuffer.isView( fileInstance.buffer ) ) { + return new zip.Uint8ArrayReader( new Uint8Array( toArrayBuffer( fileInstance.buffer ) ) ) + } + + throw new Error( 'FileInstance is not a known format' ) + } + + async readFileBlob ( fileInstance: ArrayBuffer | Blob | ScanFileLike ) { + const binaryReader = await this.getZipFileReader( fileInstance ) + const zipReader = new zip.ZipReader( binaryReader ) + const entries = await zipReader.getEntries() + + this.sendMessage({ + message: '📖 Reading file complete. 
Entries found', + status: 'read' + }) + + return entries as ScanFileEntry[] + } + + classifyBinaryEntryArchitecture ( binaryEntry: ScanMachoMeta ) { + const armArchitecture = binaryEntry.architectures.find( architecture => { + if ( !isString( architecture.processorType ) ) return false + + return architecture.processorType.toLowerCase().includes( 'arm' ) + } ) + + return armArchitecture !== undefined + } + + matchesMachoExecutable ( entry: ScanFileEntry ) { + if ( entry.filename.split( '/' ).length > 4 ) return false + + return [ + 'Contents/MacOS/' + ].some( pathToMatch => { + return entry.filename.includes( pathToMatch ) + } ) + } + + matchesRootInfoPlist ( entry: ScanFileEntry ) { + if ( entry.filename.split( '/' ).length > 3 ) return false + if ( entry.filename.endsWith( '/' ) ) return false + if ( entry.filename === 'Contents/Info.plist' ) return true + + return [ + '.app/Contents/Info.plist', + '.zip/Contents/Info.plist' + ].some( pathToMatch => { + return entry.filename.endsWith( pathToMatch ) + } ) + } + + fileEntryType ( fileEntry: ScanFileEntry ) { + if ( fileEntry.directory ) return 'directory' + if ( this.matchesMachoExecutable( fileEntry ) ) return 'machoExecutable' + if ( this.matchesRootInfoPlist( fileEntry ) ) return 'rootInfoPlist' + + return 'unknown' + } + + storeInfoPlist = async ( fileEntry: ScanFileEntry ) => { + if ( this.hasInfoPlist ) { + throw new Error( 'More than one root info.plist found' ) + } + + const infoUint8Array = await this.readFileEntryData>( fileEntry, zip.Uint8ArrayWriter as new () => InstanceType ) as Uint8Array + const infoNodeBuffer = makeNodeFileBuffer( infoUint8Array ) + + this.infoPlist = await parsePlistBuffer( infoNodeBuffer ) as Record + + this.sendMessage({ + message: 'ℹ️ Found Info.plist', + status: this.status + }) + } + + storeMachoExecutable = ( fileEntry: ScanFileEntry ) => { + this.machoExcutables.push( fileEntry ) + + this.sendMessage({ + message: '🥊 Found a Macho executable', + status: this.status + }) + } + 
+ storeResultInfo () { + this.info = { + appVersion: this.appVersion, + filename: this.file && 'name' in this.file && typeof this.file.name === 'string' ? this.file.name : '', + infoPlist: this.infoPlist, + machoMeta: this.hasMachoMeta ? { + ...this.machoMeta, + architectures: this.machoMeta.architectures.map( architecture => { + return { + bits: architecture.bits, + fileType: architecture.fileType, + header: architecture.header, + loadCommandsInfo: architecture.loadCommandsInfo, + magic: architecture.magic, + offset: architecture.offset, + processorSubType: architecture.processorSubType, + processorType: architecture.processorType + } + }) + } : null, + result: this.binarySupportsNative ? '✅' : '🔶' + } + } + + storeMachoMeta = async ( fileEntry: ScanFileEntry ) => { + if ( this.hasMachoMeta ) { + throw new Error( 'More than one primary Macho executable found' ) + } + + if ( !this.bundleExecutable ) { + throw new Error( 'No root bundleExecutable found' ) + } + + const bundleExecutableUint8Array = await this.readFileEntryData>( fileEntry, zip.Uint8ArrayWriter as new () => InstanceType ) as Uint8Array + + const machoFileInstance = new FileApi.File({ + buffer: Buffer.from( bundleExecutableUint8Array ) as unknown as NodeFile['buffer'], + name: this.bundleExecutable.filename, + type: 'application/x-mach-binary' + }) as unknown as ScanMachoFileInstance + + machoFileInstance.blob = await this.readFileEntryData>( fileEntry, zip.BlobWriter as new () => InstanceType ) as Blob + + const machoMeta = await extractMachoMeta({ + FileApi, + machoFileInstance + }) as ScanMachoMeta | null + + if ( !machoMeta || !Array.isArray( machoMeta.architectures ) ) { + throw new Error( 'Unable to read Mach-O metadata' ) + } + + this.machoMeta = machoMeta + } + + targetFiles = { + machoExecutable: { + method: this.storeMachoExecutable + }, + rootInfoPlist: { + method: this.storeInfoPlist + } + } + + findMainExecutable () { + const bundleExecutables = this.machoExcutables.filter( machoEntry => { 
+ if ( machoEntry.filename.includes( this.bundleExecutablePath ) ) { + return true + } + + return this.bundleExecutablePath.includes( machoEntry.filename ) + } ) + + if ( bundleExecutables.length > 1 ) { + throw new Error( 'More than one root bundleExecutable found' ) + } + + if ( bundleExecutables.length === 0 ) { + throw new Error( 'No root bundleExecutable found' ) + } + + return bundleExecutables[ 0 ] + } + + async findTargetFiles () { + for ( const fileEntry of this.bundleFileEntries ) { + const type = this.fileEntryType( fileEntry ) as keyof typeof this.targetFiles | 'directory' | 'unknown' + + if ( type in this.targetFiles ) { + await this.targetFiles[ type as keyof typeof this.targetFiles ].method( fileEntry ) + } + } + + this.appVersion = firstNonEmptyString( [ + this.infoPlist.CFBundleShortVersionString, + this.infoPlist.CFBundleVersion + ] ) + + this.displayName = firstNonEmptyString( [ + this.infoPlist.CFBundleDisplayName, + this.infoPlist.CFBundleName, + this.infoPlist.CFBundleExecutable + ] ) + + ;([ + [ 'Version', this.infoPlist.CFBundleShortVersionString ], + [ 'Bundle Identifier', this.infoPlist.CFBundleIdentifier ], + [ 'File Mime Type', this.file && 'type' in this.file ? 
this.file.type : '' ], + [ 'Copyright', this.infoPlist.NSHumanReadableCopyright ] + ] as Array<[ string, unknown ]>).forEach( ( [ label, value ] ) => { + if ( !isNonEmptyString( value ) ) return + + this.details.push({ + label, + value: String( value ) + }) + } ) + + this.bundleExecutable = this.findMainExecutable() + + this.displayBinarySize = prettyBytes( this.bundleExecutable.uncompressedSize ) + this.binarySize = this.bundleExecutable.uncompressedSize + + await this.storeMachoMeta( this.bundleExecutable ) + + this.binarySupportsNative = this.classifyBinaryEntryArchitecture( this.machoMeta ) + } + + async runScan () { + this.sendMessage({ + message: '🚛 Loading file...', + status: 'loading' + }) + + this.file = await this.loadFile() + + this.sendMessage({ + data: this.file, + message: '📚 Extracting from archive...', + status: 'scanning' + }) + + this.bundleFileEntries = await this.readFileBlob( this.file ) + + this.sendMessage({ + message: '🎬 Starting scan', + status: 'scanning' + }) + + await this.findTargetFiles() + + this.storeResultInfo() + + this.sendMessage({ + message: '🔎 Checking online for native versions...', + status: 'checking' + }) + } + + toSnapshot (): AppScanSnapshot { + return { + appVersion: this.appVersion, + binarySize: this.binarySize, + binarySupportsNative: this.binarySupportsNative, + details: this.details, + displayBinarySize: this.displayBinarySize, + displayName: this.displayName, + hasInfo: this.hasInfo, + hasInfoPlist: this.hasInfoPlist, + hasMachoMeta: this.hasMachoMeta, + info: this.info, + infoPlist: this.infoPlist, + machoMeta: this.machoMeta, + status: 'finished', + supportedArchitectures: this.supportedArchitectures + } + } + + async start () { + await this.runScan() + } +} diff --git a/helpers/scanner/worker.mjs b/helpers/scanner/worker.mjs deleted file mode 100644 index a743f2a..0000000 --- a/helpers/scanner/worker.mjs +++ /dev/null @@ -1,43 +0,0 @@ -import { AppScan } from './scan.mjs' - -self.onmessage = ( event ) => { - - 
console.log( 'Worker received message', event ) - - const { status } = event.data - - // https://developer.mozilla.org/en-US/docs/Web/API/Worker/postMessage - // self.postMessage( event ) - - - if ( status === 'start' ) { - // Get Scan Options - const { options } = event.data - - // console.log( 'options', options ) - - const scan = new AppScan({ - fileLoader: options.file, - // Use self.postMessage as the message callback - messageReceiver: ( details ) => { - self.postMessage( details ) - } - }) - - scan.start() - .then( () => { - self.postMessage( { - status: 'finished', - // Convert App Scan instance to a more primitive Object - // so that it's clonneable for our worker - scan: JSON.parse(JSON.stringify( scan )) - }) - }) - - return - } - - - self.postMessage( { status: 'finished' } ) - return -} diff --git a/helpers/scanner/worker.ts b/helpers/scanner/worker.ts new file mode 100644 index 0000000..ccd1215 --- /dev/null +++ b/helpers/scanner/worker.ts @@ -0,0 +1,72 @@ +/// + +import { + AppScan, + type AppScanSnapshot, + type ScanFileLike, + type ScanMessage +} from './scan' + +type WorkerRequest = + | { + options: { + file: ScanFileLike + } + status: 'start' + } + | { + status: string + } + +type WorkerResponse = + | ScanMessage + | { + error?: { + message?: string + } + message?: string + scan?: AppScanSnapshot + status: 'finished' + } + +const workerScope = self as unknown as DedicatedWorkerGlobalScope + +function isStartRequest ( request: WorkerRequest ): request is Extract { + return request.status === 'start' +} + +workerScope.onmessage = async ( event: MessageEvent ) => { + if ( !isStartRequest( event.data ) ) { + workerScope.postMessage( { + status: 'finished' + } satisfies WorkerResponse ) + return + } + + const { options } = event.data + const scan = new AppScan({ + fileLoader: options.file, + messageReceiver: ( details ) => { + workerScope.postMessage( details satisfies WorkerResponse ) + } + }) + + try { + await scan.start() + + 
workerScope.postMessage( { + scan: scan.toSnapshot(), + status: 'finished' + } satisfies WorkerResponse ) + } catch ( error ) { + const message = error instanceof Error ? error.message : String( error ) + + workerScope.postMessage( { + error: { + message + }, + message: `🚫 Error: ${ message }`, + status: 'finished' + } satisfies WorkerResponse ) + } +} diff --git a/package.json b/package.json index fc6f0f6..50e3da1 100644 --- a/package.json +++ b/package.json @@ -31,8 +31,8 @@ "test-vitest": "vitest", "test": "vitest run", "test:browser": "vitest run --config vitest.playwright.config.mjs", - "test:browser:pagefind": "vitest run --config vitest.playwright.config.mjs test/playwright/pagefind-native-filter.playwright.js", - "test:browser:pagefind:live": "PLAYWRIGHT_BASE_URL=https://doesitarm.com vitest run --config vitest.playwright.config.mjs test/playwright/pagefind-native-filter.playwright.js", + "test:browser:pagefind": "vitest run --config vitest.playwright.config.mjs test/playwright/pagefind-native-filter.playwright.ts", + "test:browser:pagefind:live": "PLAYWRIGHT_BASE_URL=https://doesitarm.com vitest run --config vitest.playwright.config.mjs test/playwright/pagefind-native-filter.playwright.ts", "dev": "pnpm run dev-astro", "build": "pnpm run generate-astro", "build-api": "pnpm run clone-readme && pnpm exec vite-node build-lists.js -- --with-api --no-lists", diff --git a/pages/apple-silicon-app-test.vue b/pages/apple-silicon-app-test.vue index 27ff275..4f34254 100644 --- a/pages/apple-silicon-app-test.vue +++ b/pages/apple-silicon-app-test.vue @@ -206,8 +206,11 @@ export default { } }, computed: { + fallbackMacsVerbiage () { + return 'Apple M4 Max or M3 Ultra Mac' + }, npm_package_config_verbiage_macs () { - return this.config.macsVerbiage //process.env.npm_package_config_verbiage_macs + return this.config?.macsVerbiage || this.$config?.macsVerbiage || this.fallbackMacsVerbiage }, foundFiles () { return this.appsBeingScanned.filter( appScan => { @@ -271,7 
+274,7 @@ export default { return `Apple Silicon Compatibility Test Online` }, description () { - return `Check for Apple Silicon compatibility for any of your apps instantly before you buy an ${ this.$config.macsVerbiage }. ` + return `Check for Apple Silicon compatibility for any of your apps instantly before you buy an ${ this.npm_package_config_verbiage_macs }. ` } }, mounted () { diff --git a/scripts/build-pagefind-index.js b/scripts/build-pagefind-index.js index a86519c..afa72fa 100644 --- a/scripts/build-pagefind-index.js +++ b/scripts/build-pagefind-index.js @@ -1,31 +1,12 @@ -import fs from 'fs-extra' -import axios from 'axios' import 'dotenv/config.js' import { - sitemapEndpointsPath -} from '~/helpers/pagefind/config.js' + loadSitemapEndpoints +} from '~/helpers/pagefind/load-sitemap-endpoints' import { writePagefindIndex } from '~/helpers/pagefind/index.js' -async function loadSitemapEndpoints () { - if ( await fs.pathExists( sitemapEndpointsPath ) ) { - return await fs.readJson( sitemapEndpointsPath ) - } - - if ( !process.env.PUBLIC_API_DOMAIN ) { - throw new Error(`Missing ${ sitemapEndpointsPath } and PUBLIC_API_DOMAIN is not set`) - } - - const apiUrl = new URL( process.env.PUBLIC_API_DOMAIN ) - apiUrl.pathname = sitemapEndpointsPath.replace(/^\.?\/?static\//, '/') - - const response = await axios.get( apiUrl.toString() ) - - return response.data -} - ;(async () => { const sitemapEndpoints = await loadSitemapEndpoints() const { diff --git a/test/_disabled/scanner/client.test.mjs b/test/_disabled/scanner/client.test.mjs index 8304082..d9513ca 100644 --- a/test/_disabled/scanner/client.test.mjs +++ b/test/_disabled/scanner/client.test.mjs @@ -15,7 +15,7 @@ import glob from 'fast-glob' import { LocalFileData } from 'get-file-object-from-local-path' import { Zip } from 'zip-lib' -import { runScanWorker } from '~/helpers/scanner/client.mjs' +import { runScanWorker } from '~/helpers/scanner/client' const appGlobOptions = { @@ -142,4 +142,3 @@ 
describe.concurrent('Apps', async () => { }) - diff --git a/test/playwright/apple-silicon-app-test.playwright.ts b/test/playwright/apple-silicon-app-test.playwright.ts new file mode 100644 index 0000000..a542359 --- /dev/null +++ b/test/playwright/apple-silicon-app-test.playwright.ts @@ -0,0 +1,178 @@ +import type { Browser, Page } from 'playwright-core' +import { + afterAll, + beforeAll, + describe, + expect, + it +} from 'vitest' + +import { + launchBrowser, + startAstroDevServer, + stopChildProcess, + type AstroDevServer +} from './support/astro-browser-test' +import { + createNativeAppArchive, + type PlaywrightUploadFile +} from './support/app-archive-fixture' + +const appTestVariants = [ + { + name: 'legacy scanner', + routeSuffix: '' + }, + { + name: 'worker scanner', + routeSuffix: '?version=2' + } +] as const + +describe( 'Apple Silicon app test page', () => { + let browser: Browser + let devServer: AstroDevServer + let appArchive: PlaywrightUploadFile + + beforeAll( async () => { + appArchive = await createNativeAppArchive() + + devServer = await startAstroDevServer({ + env: { + TEST_RESULT_STORE: '/api/test-results' + }, + preferConfiguredBaseUrl: false + }) + + browser = await launchBrowser() + await Promise.all( appTestVariants.map( variant => { + return warmAppTestRoute( browser, devServer.baseUrl, variant.routeSuffix ) + } ) ) + } ) + + afterAll( async () => { + await browser?.close() + await stopChildProcess( devServer?.process || null ) + } ) + + it.each( appTestVariants )( 'uploads an app archive through the %s path and renders a native result', async ( variant ) => { + const page = await browser.newPage() + const consoleErrors: string[] = [] + const pageErrors: string[] = [] + const submittedScans: Record[] = [] + + page.on( 'console', message => { + if ( message.type() === 'error' ) { + consoleErrors.push( message.text() ) + } + } ) + + page.on( 'pageerror', error => { + pageErrors.push( error.message ) + } ) + + await stubResultStore( page, 
submittedScans ) + + await page.goto( `${ devServer.baseUrl }/apple-silicon-app-test/${ variant.routeSuffix }`, { + waitUntil: 'load' + } ) + + await page.waitForFunction( () => { + const island = document.querySelector( 'astro-island[component-url="/pages/apple-silicon-app-test.vue"]' ) + + return Boolean( island && !island.hasAttribute( 'ssr' ) ) + }, { + timeout: 30 * 1000 + } ) + + await page.locator( 'input[type="file"]' ).setInputFiles( appArchive ) + await waitForBodyText( page, 'Total Files: 1', { + consoleErrors, + devServerOutput: devServer.output.text, + pageErrors + } ) + + const firstScanRow = page.locator( '.results-container li' ).first() + + await waitForBodyText( page, 'Playwright Native App', { + consoleErrors, + devServerOutput: devServer.output.text, + pageErrors + } ) + await waitForBodyText( page, '✅ This app is natively compatible with Apple Silicon!', { + consoleErrors, + devServerOutput: devServer.output.text, + pageErrors + } ) + + await firstScanRow.locator( 'summary' ).click() + + const rowText = await firstScanRow.textContent() + + expect( rowText ).toContain( 'Bundle Identifier' ) + expect( rowText ).toContain( 'com.doesitarm.playwright-native-app' ) + + expect( submittedScans.length, devServer.output.text ).toBe( 1 ) + expect( submittedScans[ 0 ]?.filename, JSON.stringify( submittedScans[ 0 ] ) ).toBe( 'Playwright Native App.app.zip' ) + expect( submittedScans[ 0 ]?.result, JSON.stringify( submittedScans[ 0 ] ) ).toBe( '✅' ) + expect( pageErrors, devServer.output.text ).toEqual( [] ) + expect( consoleErrors, devServer.output.text ).toEqual( [] ) + } ) +} ) + +async function stubResultStore ( page: Page, submittedScans: Record[] ) { + await page.route( '**/api/test-results', async route => { + const postData = route.request().postDataJSON() + + if ( postData && typeof postData === 'object' ) { + submittedScans.push( postData as Record ) + } + + await route.fulfill({ + status: 200, + contentType: 'application/json', + body: 
JSON.stringify({ + supportedVersionNumber: null + }) + }) + } ) +} + +async function waitForBodyText ( page: Page, expectedText: string, debugContext: { + consoleErrors: string[] + devServerOutput: string + pageErrors: string[] +} ) { + try { + await page.waitForFunction( textToFind => { + return Boolean( document.body?.textContent?.includes( textToFind ) ) + }, expectedText, { + timeout: 30 * 1000 + } ) + } catch ( error ) { + const bodyText = await page.locator( 'body' ).textContent() + + throw new Error( [ + `Timed out waiting for body text: ${ expectedText }`, + bodyText || '', + debugContext.pageErrors.join( '\n' ), + debugContext.consoleErrors.join( '\n' ), + debugContext.devServerOutput + ].filter( Boolean ).join( '\n\n' ), { + cause: error + } ) + } +} + +async function warmAppTestRoute ( browser: Browser, baseUrl: string, routeSuffix = '' ) { + const warmPage = await browser.newPage() + + try { + await warmPage.goto( `${ baseUrl }/apple-silicon-app-test/${ routeSuffix }`, { + waitUntil: 'load' + } ) + await warmPage.waitForTimeout( 5000 ) + } finally { + await warmPage.close() + } +} diff --git a/test/playwright/pagefind-native-filter.playwright.js b/test/playwright/pagefind-native-filter.playwright.js deleted file mode 100644 index 5de7712..0000000 --- a/test/playwright/pagefind-native-filter.playwright.js +++ /dev/null @@ -1,273 +0,0 @@ -import { accessSync, constants } from 'node:fs' -import { spawn } from 'node:child_process' -import net from 'node:net' - -import { chromium } from 'playwright-core' -import { - afterAll, - beforeAll, - describe, - expect, - it -} from 'vitest' - - -const command = process.platform === 'win32' ? 
'pnpm.cmd' : 'pnpm' -const host = '127.0.0.1' -const configuredBaseUrl = process.env.PLAYWRIGHT_BASE_URL || '' - -function canAccessPath ( filePath ) { - try { - accessSync( filePath, constants.X_OK ) - return true - } catch { - return false - } -} - -function getBrowserExecutablePath () { - const candidatePaths = [ - process.env.PLAYWRIGHT_BROWSER_PATH, - process.env.CHROME_BIN, - '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome', - '/Applications/Chromium.app/Contents/MacOS/Chromium', - '/opt/homebrew/bin/chromium', - ].filter( Boolean ) - - const executablePath = candidatePaths.find( canAccessPath ) - - if ( !executablePath ) { - throw new Error(`No browser executable found. Set PLAYWRIGHT_BROWSER_PATH or CHROME_BIN.`) - } - - return executablePath -} - -function getAvailablePort () { - return new Promise( ( resolve, reject ) => { - const server = net.createServer() - - server.unref() - server.on( 'error', reject ) - server.listen( 0, host, () => { - const { port } = server.address() - server.close( err => { - if ( err ) { - reject( err ) - return - } - - resolve( port ) - } ) - } ) - } ) -} - -async function waitForServer ( url, { - intervalMs = 250, - timeoutMs = 60 * 1000 -} = {} ) { - const startedAt = Date.now() - - while ( Date.now() - startedAt < timeoutMs ) { - try { - const response = await fetch( url ) - - if ( response.ok ) { - return - } - } catch {} - - await new Promise( resolve => setTimeout( resolve, intervalMs ) ) - } - - throw new Error(`Timed out waiting for dev server at ${ url }`) -} - -function stopProcess ( childProcess ) { - return new Promise( resolve => { - if ( !childProcess ) { - resolve() - return - } - - if ( childProcess.killed || childProcess.exitCode !== null ) { - resolve() - return - } - - childProcess.once( 'exit', () => resolve() ) - childProcess.kill( 'SIGTERM' ) - - setTimeout( () => { - if ( childProcess.exitCode === null ) { - childProcess.kill( 'SIGKILL' ) - } - }, 5 * 1000 ).unref() - } ) -} - 
-describe('Pagefind dev search', () => { - let browser - let devServer - let devServerOutput = '' - let baseUrl = '' - - beforeAll( async () => { - const executablePath = getBrowserExecutablePath() - if ( configuredBaseUrl.length > 0 ) { - baseUrl = configuredBaseUrl - } else { - const port = await getAvailablePort() - - baseUrl = `http://${ host }:${ port }` - - devServer = spawn( command, [ - 'exec', - 'astro', - 'dev', - '--host', - host, - '--port', - String( port ) - ], { - cwd: process.cwd(), - env: { - ...process.env, - PUBLIC_SEARCH_PROVIDER: 'pagefind' - }, - stdio: [ 'ignore', 'pipe', 'pipe' ] - } ) - - devServer.stdout.on( 'data', chunk => { - devServerOutput += chunk.toString() - } ) - devServer.stderr.on( 'data', chunk => { - devServerOutput += chunk.toString() - } ) - } - - await waitForServer( baseUrl ) - - browser = await chromium.launch({ - executablePath, - headless: true - } ) - } ) - - afterAll( async () => { - await browser?.close() - await stopProcess( devServer ) - } ) - - it('renders visible Pagefind results when Native Support is clicked', async () => { - const page = await browser.newPage() - const consoleErrors = [] - const pageErrors = [] - const pagefindResponses = [] - const failedRequests = [] - let fragmentRequests = 0 - let failedFragmentRequests = 0 - - page.on( 'console', message => { - if ( message.type() === 'error' ) { - consoleErrors.push( message.text() ) - } - } ) - - page.on( 'pageerror', error => { - pageErrors.push( error.message ) - } ) - - page.on( 'response', response => { - if ( response.url().includes( '/pagefind/pagefind.js' ) ) { - pagefindResponses.push({ - status: response.status(), - url: response.url() - }) - } - } ) - - page.on( 'request', request => { - if ( request.url().includes( '/pagefind/' ) && request.url().includes( 'pf_fragment' ) ) { - fragmentRequests++ - } - } ) - - page.on( 'requestfailed', request => { - if ( request.url().includes( '/pagefind/pagefind.js' ) ) { - failedRequests.push({ - 
errorText: request.failure()?.errorText || 'unknown', - url: request.url() - }) - } - - if ( request.url().includes( '/pagefind/' ) && request.url().includes( 'pf_fragment' ) ) { - failedFragmentRequests++ - } - } ) - - await page.goto( baseUrl, { - waitUntil: 'domcontentloaded' - } ) - - await page.waitForTimeout( 3000 ) - - await Promise.all([ - page.waitForResponse( response => { - return response.url().includes( '/pagefind/pagefind.js' ) - }, { - timeout: 10 * 1000 - } ), - page.getByRole( 'button', { - name: /native support/i - } ).click() - ]) - - await page.waitForFunction( () => { - return [ ...document.querySelectorAll( 'li[data-app-slug] h3' ) ].some( node => { - const text = node.textContent || '' - return text.trim().length > 0 && !/loading/i.test( text ) - } ) - }, { - timeout: 15 * 1000 - } ) - - const bodyText = await page.locator( 'body' ).textContent() - const renderedResults = await page.evaluate( () => { - const headings = [ ...document.querySelectorAll( 'li[data-app-slug] h3' ) ].map( node => { - return ( node.textContent || '' ).trim() - } ) - - return { - loadingRows: headings.filter( text => /loading/i.test( text ) ).length, - rows: document.querySelectorAll( 'li[data-app-slug]' ).length, - visibleHeadings: headings.slice( 0, 5 ) - } - } ) - - expect( pagefindResponses.some( response => response.status === 200 ), devServerOutput ).toBe( true ) - expect( - pagefindResponses.some( response => response.status >= 400 ), - [ - pagefindResponses.map( response => `${ response.status } ${ response.url }` ).join( '\n' ), - failedRequests.map( request => `${ request.errorText } ${ request.url }` ).join( '\n' ), - pageErrors.join( '\n' ), - consoleErrors.join( '\n' ) - ].join( '\n\n' ) - ).toBe( false ) - expect( fragmentRequests, JSON.stringify( renderedResults ) ).toBeGreaterThan( 0 ) - expect( fragmentRequests, JSON.stringify( renderedResults ) ).toBeLessThan( 100 ) - expect( failedFragmentRequests, JSON.stringify( renderedResults ) ).toBe( 0 ) - 
expect( renderedResults.rows, JSON.stringify( renderedResults ) ).toBeGreaterThan( 0 ) - expect( renderedResults.loadingRows, JSON.stringify( renderedResults ) ).toBe( 0 ) - expect( bodyText ).not.toContain( 'Failed to load url /pagefind/pagefind.js' ) - expect( bodyText ).not.toContain( 'No apps found for' ) - expect( pageErrors.join( '\n' ) ).not.toMatch( /pagefind\/pagefind\.js/ ) - expect( pageErrors.join( '\n' ) ).not.toMatch( /Failed to fetch/ ) - expect( consoleErrors.join( '\n' ) ).not.toMatch( /pagefind\/pagefind\.js/ ) - expect( consoleErrors.join( '\n' ) ).not.toMatch( /ERR_INSUFFICIENT_RESOURCES/ ) - - await page.close() - } ) -} ) diff --git a/test/playwright/pagefind-native-filter.playwright.ts b/test/playwright/pagefind-native-filter.playwright.ts new file mode 100644 index 0000000..face037 --- /dev/null +++ b/test/playwright/pagefind-native-filter.playwright.ts @@ -0,0 +1,149 @@ +import type { Browser } from 'playwright-core' +import { + afterAll, + beforeAll, + describe, + expect, + it +} from 'vitest' + +import { + launchBrowser, + startAstroDevServer, + stopChildProcess, + type AstroDevServer +} from './support/astro-browser-test' + +describe( 'Pagefind dev search', () => { + let browser: Browser + let devServer: AstroDevServer + + beforeAll( async () => { + devServer = await startAstroDevServer({ + env: { + PUBLIC_SEARCH_PROVIDER: 'pagefind' + } + }) + + browser = await launchBrowser() + } ) + + afterAll( async () => { + await browser?.close() + await stopChildProcess( devServer?.process || null ) + } ) + + it( 'renders visible Pagefind results when Native Support is clicked', async () => { + const page = await browser.newPage() + const consoleErrors: string[] = [] + const pageErrors: string[] = [] + const pagefindResponses: Array<{ + status: number + url: string + }> = [] + const failedRequests: Array<{ + errorText: string + url: string + }> = [] + let fragmentRequests = 0 + let failedFragmentRequests = 0 + + page.on( 'console', message => { + 
if ( message.type() === 'error' ) { + consoleErrors.push( message.text() ) + } + } ) + + page.on( 'pageerror', error => { + pageErrors.push( error.message ) + } ) + + page.on( 'response', response => { + if ( response.url().includes( '/pagefind/pagefind.js' ) ) { + pagefindResponses.push({ + status: response.status(), + url: response.url() + }) + } + } ) + + page.on( 'request', request => { + if ( request.url().includes( '/pagefind/' ) && request.url().includes( 'pf_fragment' ) ) { + fragmentRequests++ + } + } ) + + page.on( 'requestfailed', request => { + if ( request.url().includes( '/pagefind/pagefind.js' ) ) { + failedRequests.push({ + errorText: request.failure()?.errorText || 'unknown', + url: request.url() + }) + } + + if ( request.url().includes( '/pagefind/' ) && request.url().includes( 'pf_fragment' ) ) { + failedFragmentRequests++ + } + } ) + + await page.goto( devServer.baseUrl, { + waitUntil: 'domcontentloaded' + } ) + + await page.waitForTimeout( 3000 ) + + await Promise.all([ + page.waitForResponse( response => { + return response.url().includes( '/pagefind/pagefind.js' ) + }, { + timeout: 10 * 1000 + } ), + page.getByRole( 'button', { + name: /native support/i + } ).click() + ]) + + await page.waitForFunction( () => { + return [ ...document.querySelectorAll( 'li[data-app-slug] h3' ) ].some( node => { + const text = node.textContent || '' + return text.trim().length > 0 && !/loading/i.test( text ) + } ) + }, { + timeout: 15 * 1000 + } ) + + const bodyText = await page.locator( 'body' ).textContent() + const renderedResults = await page.evaluate( () => { + const headings = [ ...document.querySelectorAll( 'li[data-app-slug] h3' ) ].map( node => { + return ( node.textContent || '' ).trim() + } ) + + return { + loadingRows: headings.filter( text => /loading/i.test( text ) ).length, + rows: document.querySelectorAll( 'li[data-app-slug]' ).length, + visibleHeadings: headings.slice( 0, 5 ) + } + } ) + + expect( pagefindResponses.some( response => 
response.status === 200 ), devServer.output.text ).toBe( true ) + expect( + pagefindResponses.some( response => response.status >= 400 ), + [ + pagefindResponses.map( response => `${ response.status } ${ response.url }` ).join( '\n' ), + failedRequests.map( request => `${ request.errorText } ${ request.url }` ).join( '\n' ), + pageErrors.join( '\n' ), + consoleErrors.join( '\n' ) + ].join( '\n\n' ) + ).toBe( false ) + expect( fragmentRequests, JSON.stringify( renderedResults ) ).toBeGreaterThan( 0 ) + expect( fragmentRequests, JSON.stringify( renderedResults ) ).toBeLessThan( 100 ) + expect( failedFragmentRequests, JSON.stringify( renderedResults ) ).toBe( 0 ) + expect( renderedResults.rows, bodyText ).toBeGreaterThan( 0 ) + expect( renderedResults.loadingRows, JSON.stringify( renderedResults ) ).toBe( 0 ) + expect( bodyText?.includes( 'No results found' ) ?? false, JSON.stringify( renderedResults ) ).toBe( false ) + expect( consoleErrors, devServer.output.text ).toEqual( [] ) + expect( pageErrors, devServer.output.text ).toEqual( [] ) + + await page.close() + } ) +} ) diff --git a/test/playwright/support/app-archive-fixture.ts b/test/playwright/support/app-archive-fixture.ts new file mode 100644 index 0000000..14b132a --- /dev/null +++ b/test/playwright/support/app-archive-fixture.ts @@ -0,0 +1,76 @@ +import { mkdtemp, mkdir, readFile, rm, writeFile } from 'node:fs/promises' +import { tmpdir } from 'node:os' +import { join } from 'node:path' + +import { Zip } from 'zip-lib' + +const machoObjectBase64 = + 
'z/rt/gwAAAEAAAAAAQAAAAQAAABoAQAAACAAAAAAAAAZAAAA6AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAADgAAAAAAAAAiAEAAAAAAAA4AAAAAAAAAAcAAAAHAAAAAgAAAAAAAABfX3RleHQAAAAAAAAAAAAAX19URVhUAAAAAAAAAAAAAAAAAAAAAAAAFAAAAAAAAACIAQAAAgAAAAAAAAAAAAAAAAQAgAAAAAAAAAAAAAAAAF9fY29tcGFjdF91bndpbmRfX0xEAAAAAAAAAAAAAAAAGAAAAAAAAAAgAAAAAAAAAKABAAADAAAAwAEAAAEAAAAAAAACAAAAAAAAAAAAAAAAMgAAABgAAAABAAAAAAALAAACGgAAAAAAAgAAABgAAADIAQAAAwAAAPgBAAAYAAAACwAAAFAAAAAAAAAAAgAAAAIAAAABAAAAAwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAD/QwDRAACAUv8PALn/QwCRwANf1gAAAAAAAAAAAAAAABQAAAAAEAACAAAAAAAAAAAAAAAAAAAAAAAAAAABAAAGDQAAAA4BAAAAAAAAAAAAAAcAAAAOAgAAGAAAAAAAAAABAAAADwEAAAAAAAAAAAAAAF9tYWluAGx0bXAxAGx0bXAwAAAAAAAA' + +export interface PlaywrightUploadFile { + arrayBuffer: ArrayBuffer + buffer: Buffer + mimeType: string + name: string + type: string +} + +function makeInfoPlist ( appName: string ) { + return [ + '', + '', + '', + '', + ' CFBundleDisplayName', + ` ${ appName }`, + ' CFBundleExecutable', + ` ${ appName }`, + ' CFBundleIdentifier', + ` com.doesitarm.${ appName.toLowerCase().replaceAll( ' ', '-' ) }`, + ' CFBundleName', + ` ${ appName }`, + ' CFBundleShortVersionString', + ' 1.0.0', + '', + '', + '' + ].join( '\n' ) +} + +export async function createNativeAppArchive ( appName = 'Playwright Native App' ): Promise { + const tempRoot = await mkdtemp( join( tmpdir(), 'doesitarm-playwright-' ) ) + const appBundlePath = join( tempRoot, `${ appName }.app` ) + const contentsPath = join( appBundlePath, 'Contents' ) + const executablePath = join( contentsPath, 'MacOS', appName ) + const archivePath = join( tempRoot, `${ appName }.app.zip` ) + + try { + const executableBytes = new Uint8Array( Buffer.from( machoObjectBase64, 'base64' ) ) + + await mkdir( join( contentsPath, 'MacOS' ), { recursive: true } ) + await writeFile( join( contentsPath, 'Info.plist' ), makeInfoPlist( appName ) ) + await writeFile( executablePath, executableBytes, { mode: 0o755 } ) + + const zip = new Zip() + 
+ zip.addFolder( appBundlePath, `${ appName }.app` ) + await zip.archive( archivePath ) + + const archiveBuffer = await readFile( archivePath ) + + const archiveArrayBuffer = new Uint8Array( archiveBuffer ).slice().buffer + + return { + arrayBuffer: archiveArrayBuffer, + buffer: archiveBuffer, + mimeType: 'application/zip', + name: `${ appName }.app.zip`, + type: 'application/zip' + } + } finally { + await rm( tempRoot, { + force: true, + recursive: true + } ) + } +} diff --git a/test/playwright/support/astro-browser-test.ts b/test/playwright/support/astro-browser-test.ts new file mode 100644 index 0000000..af3b2d1 --- /dev/null +++ b/test/playwright/support/astro-browser-test.ts @@ -0,0 +1,176 @@ +import { spawn, type ChildProcessWithoutNullStreams } from 'node:child_process' +import { accessSync, constants } from 'node:fs' +import net from 'node:net' + +import { chromium, type Browser } from 'playwright-core' + +const command = process.platform === 'win32' ? 'pnpm.cmd' : 'pnpm' +const host = '127.0.0.1' + +export interface AstroDevServer { + baseUrl: string + output: { + text: string + } + process: ChildProcessWithoutNullStreams | null +} + +function canAccessPath ( filePath: string ) { + try { + accessSync( filePath, constants.X_OK ) + return true + } catch { + return false + } +} + +export function getBrowserExecutablePath () { + const candidatePaths = [ + process.env.PLAYWRIGHT_BROWSER_PATH, + process.env.CHROME_BIN, + '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome', + '/Applications/Chromium.app/Contents/MacOS/Chromium', + '/opt/homebrew/bin/chromium', + ].filter( ( value ): value is string => Boolean( value ) ) + + const executablePath = candidatePaths.find( canAccessPath ) + + if ( !executablePath ) { + throw new Error( 'No browser executable found. Set PLAYWRIGHT_BROWSER_PATH or CHROME_BIN.' 
) + } + + return executablePath +} + +function getAvailablePort () { + return new Promise( ( resolve, reject ) => { + const server = net.createServer() + + server.unref() + server.on( 'error', reject ) + server.listen( 0, host, () => { + const address = server.address() + + if ( !address || typeof address === 'string' ) { + reject( new Error( 'Unable to determine a free port.' ) ) + return + } + + server.close( err => { + if ( err ) { + reject( err ) + return + } + + resolve( address.port ) + } ) + } ) + } ) +} + +export async function waitForServer ( url: string, { + intervalMs = 250, + timeoutMs = 60 * 1000 +} = {} ) { + const startedAt = Date.now() + + while ( Date.now() - startedAt < timeoutMs ) { + try { + const response = await fetch( url ) + + if ( response.ok ) { + return + } + } catch {} + + await new Promise( resolve => setTimeout( resolve, intervalMs ) ) + } + + throw new Error( `Timed out waiting for dev server at ${ url }` ) +} + +export async function startAstroDevServer ( { + env = {}, + preferConfiguredBaseUrl = true +}: { + env?: Record + preferConfiguredBaseUrl?: boolean +} = {} ): Promise { + const configuredBaseUrl = process.env.PLAYWRIGHT_BASE_URL || '' + + if ( preferConfiguredBaseUrl && configuredBaseUrl.length > 0 ) { + await waitForServer( configuredBaseUrl ) + + return { + baseUrl: configuredBaseUrl, + output: { text: '' }, + process: null + } + } + + const port = await getAvailablePort() + const baseUrl = `http://${ host }:${ port }` + const output = { text: '' } + + const childProcess = spawn( command, [ + 'exec', + 'astro', + 'dev', + '--host', + host, + '--port', + String( port ) + ], { + cwd: process.cwd(), + env: { + ...process.env, + ...env + }, + stdio: [ 'ignore', 'pipe', 'pipe' ] + } ) + + childProcess.stdout.on( 'data', chunk => { + output.text += chunk.toString() + } ) + childProcess.stderr.on( 'data', chunk => { + output.text += chunk.toString() + } ) + + await waitForServer( baseUrl ) + + return { + baseUrl, + output, + 
process: childProcess + } +} + +export function stopChildProcess ( childProcess: ChildProcessWithoutNullStreams | null ) { + return new Promise( resolve => { + if ( !childProcess ) { + resolve() + return + } + + if ( childProcess.killed || childProcess.exitCode !== null ) { + resolve() + return + } + + childProcess.once( 'exit', () => resolve() ) + childProcess.kill( 'SIGTERM' ) + + setTimeout( () => { + if ( childProcess.exitCode === null ) { + childProcess.kill( 'SIGKILL' ) + } + }, 5 * 1000 ).unref() + } ) +} + +export async function launchBrowser (): Promise { + return chromium.launch({ + executablePath: getBrowserExecutablePath(), + headless: true + }) +} diff --git a/test/prebuild/load-sitemap-endpoints.test.ts b/test/prebuild/load-sitemap-endpoints.test.ts new file mode 100644 index 0000000..48c4f4f --- /dev/null +++ b/test/prebuild/load-sitemap-endpoints.test.ts @@ -0,0 +1,94 @@ +import { + beforeEach, + describe, + expect, + it, + vi +} from 'vitest' + +import axios from 'axios' + +import { + fetchJsonWithRetries, + shouldRetryError +} from '~/helpers/pagefind/load-sitemap-endpoints' + +vi.mock( 'axios', () => { + return { + default: { + get: vi.fn() + } + } +} ) + +describe( 'load sitemap endpoints helper', () => { + beforeEach( () => { + vi.mocked( axios.get ).mockReset() + } ) + + it( 'retries transient 5xx errors and eventually resolves', async () => { + const axiosGet = vi.mocked( axios.get ) + + axiosGet + .mockRejectedValueOnce({ + response: { + status: 502 + } + }) + .mockResolvedValueOnce({ + data: { + ok: true + } + } ) + + const data = await fetchJsonWithRetries( 'https://api.doesitarm.com/sitemap-endpoints.json', { + attempts: 2, + delayMs: 0 + } ) + + expect( data ).toEqual({ + ok: true + } ) + expect( axiosGet ).toHaveBeenCalledTimes( 2 ) + } ) + + it( 'does not retry non-5xx errors', async () => { + const axiosGet = vi.mocked( axios.get ) + + axiosGet.mockRejectedValueOnce({ + response: { + status: 404 + } + }) + + await expect( 
fetchJsonWithRetries( 'https://api.doesitarm.com/sitemap-endpoints.json', { + attempts: 3, + delayMs: 0 + } ) ).rejects.toEqual({ + response: { + status: 404 + } + }) + + expect( axiosGet ).toHaveBeenCalledTimes( 1 ) + } ) + + it( 'classifies retryable server errors', () => { + expect( shouldRetryError( { + response: { + status: 502 + } + } ) ).toBe( true ) + expect( shouldRetryError( { + response: { + status: 503 + } + } ) ).toBe( true ) + expect( shouldRetryError( { + response: { + status: 404 + } + } ) ).toBe( false ) + expect( shouldRetryError( new Error( 'network' ) ) ).toBe( false ) + } ) +} ) diff --git a/test/scanner/client.test.ts b/test/scanner/client.test.ts new file mode 100644 index 0000000..57894b9 --- /dev/null +++ b/test/scanner/client.test.ts @@ -0,0 +1,48 @@ +import { + describe, + expect, + it +} from 'vitest' +import '@vitest/web-worker' + +import { runScanWorker } from '~/helpers/scanner/client' + +import { createNativeAppArchive } from '../playwright/support/app-archive-fixture' + +describe( 'scanner worker client', () => { + it( 'extracts app metadata from a zipped native app fixture', async () => { + const progressMessages: string[] = [] + const archiveFile = await createNativeAppArchive() + + const { scan } = await runScanWorker( archiveFile, details => { + if ( typeof details.message === 'string' ) { + progressMessages.push( details.message ) + } + } ) + + expect( progressMessages ).toContain( 'ℹ️ Found Info.plist' ) + expect( scan.status ).toBe( 'finished' ) + expect( scan.displayName ).toBe( 'Playwright Native App' ) + expect( scan.appVersion ).toBe( '1.0.0' ) + expect( scan.binarySupportsNative ).toBe( true ) + expect( scan.displayBinarySize.length ).toBeGreaterThan( 0 ) + expect( scan.details ).toEqual( expect.arrayContaining( [ + expect.objectContaining({ + label: 'Bundle Identifier', + value: 'com.doesitarm.playwright-native-app' + }), + expect.objectContaining({ + label: 'Version', + value: '1.0.0' + }) + ] ) ) + expect( 
scan.info.filename ).toBe( 'Playwright Native App.app.zip' ) + expect( scan.info.result ).toBe( '✅' ) + expect( scan.info.infoPlist.CFBundleIdentifier ).toBe( 'com.doesitarm.playwright-native-app' ) + expect( scan.info.machoMeta.architectures ).toEqual( expect.arrayContaining( [ + expect.objectContaining({ + processorType: 'ARM64' + }) + ] ) ) + } ) +} ) diff --git a/test/scanner/file-api.test.ts b/test/scanner/file-api.test.ts new file mode 100644 index 0000000..903b9a7 --- /dev/null +++ b/test/scanner/file-api.test.ts @@ -0,0 +1,72 @@ +import { Buffer } from 'buffer' + +import { + describe, + expect, + it +} from 'vitest' + +import { + File, + FileReader, + type FileReaderLoadEvent +} from '~/helpers/scanner/file-api' + +describe( 'scanner file api shim', () => { + it( 'constructs a file from a buffer payload', () => { + const file = new File({ + buffer: Buffer.from( 'hello world', 'utf8' ), + name: 'hello.txt', + type: 'text/plain' + }) + + expect( file.name ).toBe( 'hello.txt' ) + expect( file.type ).toBe( 'text/plain' ) + expect( file.size ).toBe( 11 ) + } ) + + it( 'reads text content through the node FileReader shim', async () => { + const file = new File({ + buffer: Buffer.from( 'scanner-text', 'utf8' ), + name: 'scanner.txt', + type: 'text/plain' + }) + const reader = new FileReader() + + const loadedText = await new Promise( ( resolve, reject ) => { + reader.onerror = reject + reader.onload = event => { + const loadEvent = event as FileReaderLoadEvent + + resolve( String( loadEvent.target.result ) ) + } + + reader.readAsText( file ) + } ) + + expect( loadedText ).toBe( 'scanner-text' ) + } ) + + it( 'reads binary content through the node FileReader shim', async () => { + const file = new File({ + buffer: Buffer.from( [ 0xde, 0xad, 0xbe, 0xef ] ), + name: 'scanner.bin', + type: 'application/octet-stream' + }) + const reader = new FileReader() + + const loadedBuffer = await new Promise( ( resolve, reject ) => { + reader.onerror = reject + reader.onload = 
event => { + const loadEvent = event as FileReaderLoadEvent + + resolve( loadEvent.target.nodeBufferResult ) + } + + reader.readAsArrayBuffer( file ) + } ) + + expect( Buffer.isBuffer( loadedBuffer ) ).toBe( true ) + expect( loadedBuffer.toString( 'hex' ) ).toBe( 'deadbeef' ) + } ) +} ) diff --git a/test/scanner/plist.test.ts b/test/scanner/plist.test.ts new file mode 100644 index 0000000..9fe2f5e --- /dev/null +++ b/test/scanner/plist.test.ts @@ -0,0 +1,52 @@ +import { Buffer } from 'buffer' + +import { + describe, + expect, + it, + vi +} from 'vitest' + +import { + parseFileSync, + parsePlistBuffer +} from '~/helpers/scanner/parsers/plist-parser' + +type ParsedPlist = Record + +const xmlPlist = Buffer.from( [ + '', + '', + '', + '', + ' CFBundleExecutable', + ' Playwright Native App', + ' CFBundleIdentifier', + ' com.doesitarm.playwright-native-app', + '', + '' +].join( '\n' ), 'utf8' ) + +describe( 'plist parser', () => { + it( 'parses xml plist buffers asynchronously', async () => { + const callback = vi.fn() + const plist = await parsePlistBuffer( xmlPlist as any, callback ) as ParsedPlist + + expect( plist.CFBundleExecutable ).toBe( 'Playwright Native App' ) + expect( plist.CFBundleIdentifier ).toBe( 'com.doesitarm.playwright-native-app' ) + expect( callback ).toHaveBeenCalledWith( null, plist ) + } ) + + it( 'parses xml plist buffers synchronously', () => { + const plist = parseFileSync( xmlPlist as any ) as ParsedPlist + + expect( plist.CFBundleExecutable ).toBe( 'Playwright Native App' ) + expect( plist.CFBundleIdentifier ).toBe( 'com.doesitarm.playwright-native-app' ) + } ) + + it( 'rejects invalid plist data', async () => { + await expect( parsePlistBuffer( Buffer.from( 'not-a-plist', 'utf8' ) as any ) ) + .rejects + .toThrow( /Invalid binary plist/i ) + } ) +} ) diff --git a/vitest.playwright.config.mjs b/vitest.playwright.config.mjs index 987a9cc..57fb266 100644 --- a/vitest.playwright.config.mjs +++ b/vitest.playwright.config.mjs @@ -9,7 +9,8 @@ const 
vitestConfig = { test: { setupFiles: 'tsconfig-paths/register', include: [ - 'test/playwright/**/*.playwright.js' + 'test/playwright/**/*.playwright.js', + 'test/playwright/**/*.playwright.ts' ], exclude: [ 'test/_disabled/**'