doesitarm/helpers/api/sitemap/parse.js
ThatGuySam d45b587434 Finish axios migration via shared native HTTP helper
Replace all in-scope axios callsites with a new helpers/http.js wrapper over native fetch, including JSON/text GET, JSON POST, HEAD checks, and transient 5xx retry behavior; update all browser, build, script, and proxy API clients to use it; add focused unit tests; and remove axios from package dependencies.

Constraint: Preserve API/build and deployment behavior while lowering transport surface area.

Rejected: inline fetch replacements at each callsite | rejected to avoid inconsistent error/retry semantics.

Confidence: high

Scope-risk: moderate

Directive: Keep helper in place as the transport boundary and update tests when changing request semantics.

Tested: pnpm run -s typecheck, pnpm -s run test-prebuild, pnpm -s run test, pnpm -s run test:browser, pnpm -s run netlify-build, smoke GETs on /apple-silicon-app-test and /apple-silicon-app-test/?version=2

Not-tested: branch/netlify deployment health in CI pipeline after merge
2026-04-06 12:09:16 -05:00

141 lines
3.6 KiB
JavaScript

import path from 'path'
import fs from 'fs-extra'
import { parse } from 'fast-xml-parser'
import {
sitemapLocation,
sitemapIndexFileName,
} from '~/helpers/constants.js'
import { isValidHttpUrl } from '~/helpers/check-types.js'
import {
getText,
headOk
} from '~/helpers/http.js'
// Sitemap file names probed on a site, in the order they are tried:
// the project's own sitemap index file name first, then the conventional `sitemap.xml`
const sitemapFilesToTry = [ sitemapIndexFileName, 'sitemap.xml' ]
/**
 * Extract the hostname from an absolute URL string.
 *
 * @param {string} urlString - Absolute URL to inspect.
 * @returns {string} The URL's hostname (no port, no protocol).
 * @throws {TypeError} When `urlString` cannot be parsed by `URL`.
 */
function getDomainFromString ( urlString ) {
    const { hostname } = new URL( urlString )

    return hostname
}
/**
 * Extract the path portion from an absolute URL string.
 *
 * @param {string} urlString - Absolute URL to inspect.
 * @returns {string} The URL's pathname (query string and hash excluded).
 * @throws {TypeError} When `urlString` cannot be parsed by `URL`.
 */
function getPathFromUrl ( urlString ) {
    const { pathname } = new URL( urlString )

    return pathname
}
/**
 * Parse sitemap XML and return its entries as an array.
 *
 * Handles both sitemap flavours:
 * - a sitemap index (`<sitemapindex>`), whose `<sitemap>` children are returned
 * - a URL set (`<urlset>`), whose `<url>` children are returned
 *
 * The result is always an array, including when the document holds a single
 * entry or no entries at all.
 *
 * @param {string} sitemapXml - Raw sitemap XML.
 * @returns {Array} The parsed `<sitemap>` or `<url>` entries; empty when there are none.
 * @throws {TypeError} When the XML has neither a `<sitemapindex>` nor a `<urlset>` root.
 */
export function parseSitemapXml ( sitemapXml ) {
    const sitemapRoot = parse( sitemapXml )

    // The parser yields a bare value (not an array) when an element occurs once
    // and nothing usable when it is absent or empty, so normalise every case to an array
    const toEntryList = value => {
        if ( value === undefined || value === null || value === '' ) return []

        return Array.isArray( value ) ? value : [ value ]
    }

    const {
        sitemapindex = null,
        urlset = null,
    } = sitemapRoot

    if ( sitemapindex !== null ) return toEntryList( sitemapindex.sitemap )

    if ( urlset !== null ) return toEntryList( urlset.url )

    // Fail with an explicit message instead of crashing on a null property access
    throw new TypeError( 'Sitemap XML has neither a <sitemapindex> nor a <urlset> root element' )
}
/**
 * Collect every URL entry reachable from a sitemap stored on disk.
 *
 * Delegates to `getAllUrlsFromSitemap`, supplying a loader that reads each
 * sitemap file from the filesystem as UTF-8 text.
 *
 * @param {string} sitemapPath - Filesystem path of the starting sitemap.
 * @returns {Promise<Array>} Whatever `getAllUrlsFromSitemap` resolves to.
 */
export async function getAllUrlsFromLocalSitemap ( sitemapPath ) {
    // Loader handed to the generic walker: path in, file contents out
    const readSitemapFile = async filePath => {
        return await fs.readFile( filePath, 'utf8' )
    }

    const allEntries = await getAllUrlsFromSitemap( sitemapPath, { getMethod: readSitemapFile } )

    return allEntries
}
/**
 * Load a sitemap and return all of its URL entries; when the sitemap is an
 * index of other sitemaps, each child is loaded recursively and the results
 * are flattened into a single array.
 *
 * @param {string} sitemapSource - Where the sitemap lives: a URL or a path,
 *   passed verbatim to `getMethod`.
 * @param {object} options
 * @param {Function} options.getMethod - Async loader called with a sitemap
 *   source and resolving to that sitemap's XML text. Required.
 * @returns {Promise<Array>} Parsed entries; empty when the sitemap lists nothing.
 * @throws {TypeError} When `getMethod` is not a function.
 */
export async function getAllUrlsFromSitemap ( sitemapSource, { getMethod } = {} ) {
    if ( typeof getMethod !== 'function' ) {
        throw new TypeError( 'getAllUrlsFromSitemap requires a getMethod function' )
    }

    // When the source passes the http-URL check only its path portion is kept,
    // since child sitemaps are located relative to this sitemap's directory
    const sitemapPath = isValidHttpUrl( sitemapSource ) ? getPathFromUrl( sitemapSource ) : sitemapSource

    // Get initial sitemap
    const sitemapXml = await getMethod( sitemapSource )

    const sitemapDirectory = path.dirname( sitemapPath )

    // Normalise the parse result so a lone entry or a missing result
    // cannot crash the checks below
    const parsedEntries = parseSitemapXml( sitemapXml )
    const entryList = Array.isArray( parsedEntries )
        ? parsedEntries
        : ( parsedEntries == null ? [] : [ parsedEntries ] )
    const urlEntries = entryList.filter( entry => entry != null )

    // An empty sitemap has nothing to inspect or recurse into
    if ( urlEntries.length === 0 ) return urlEntries

    // The first entry decides: a `loc` mentioning `.xml` marks a sitemap index
    const firstLoc = urlEntries[0].loc
    const isSitemapIndex = typeof firstLoc === 'string' && firstLoc.includes('.xml')

    if ( !isSitemapIndex ) return urlEntries

    // Load every child sitemap in parallel
    const sitemaps = await Promise.all( urlEntries.map( async entry => {
        // Re-root the child's URL path under the current sitemap's directory
        const sitemapUrl = new URL( entry.loc )
        const childSitemapPath = path.join( sitemapDirectory, sitemapUrl.pathname )

        return await getAllUrlsFromSitemap( childSitemapPath, { getMethod } )
    }))

    // Flatten array
    return sitemaps.flat()
}
/**
 * Probe a site for each known sitemap file name and gather every URL entry
 * from the sitemaps that exist.
 *
 * @param {string} urlString - Base URL that the sitemap file names (and any
 *   child sitemap paths) are resolved against.
 * @returns {Promise<Map<string, URL>>} Map keyed by each entry's `loc` string,
 *   valued by the same `loc` parsed as a `URL`. Repeated `loc`s collapse to one key.
 */
export async function fetchAllUrlsFromSitemaps ( urlString ) {
    const urlsByLoc = new Map()

    // Loader for the sitemap walker: resolve the given path against the base URL
    // and download the sitemap body as text
    const downloadSitemap = async sitemapPath => {
        const { href } = new URL( sitemapPath, urlString )

        return await getText( href )
    }

    for ( const sitemapFile of sitemapFilesToTry ) {
        const { href: candidateHref } = new URL( sitemapFile, urlString )

        // Just do a quick HEAD request to see if the file exists without getting the whole body
        const exists = await headOk( candidateHref )

        if ( exists ) {
            // Fetch the whole sitemap, following any nested sitemaps
            const entries = await getAllUrlsFromSitemap( candidateHref, { getMethod: downloadSitemap } )

            // Record each entry under its `loc`
            for ( const entry of entries ) {
                urlsByLoc.set( entry.loc, new URL( entry.loc ) )
            }
        }
    }

    return urlsByLoc
}