diff --git a/helpers/api/sitemap/parse.js b/helpers/api/sitemap/parse.js
new file mode 100644
index 0000000..09de3a3
--- /dev/null
+++ b/helpers/api/sitemap/parse.js
@@ -0,0 +1,104 @@
+import path from 'path'
+import fs from 'fs-extra'
+import axios from 'axios'
+import { parse } from 'fast-xml-parser'
+
+import {
+  sitemapLocation,
+  sitemapIndexFileName,
+} from '~/helpers/constants.js'
+
+// Candidate sitemap file names, listed in the order they are iterated
+const sitemapFilesToTry = [
+  sitemapIndexFileName,
+  'sitemap.xml'
+]
+
+// Normalise a parsed XML child to an array: the parser yields a bare object
+// for a single element and nothing at all for an absent one
+function toEntryList ( value ) {
+  if ( value === undefined || value === null ) return []
+
+  return Array.isArray( value ) ? value : [ value ]
+}
+
+// Parse sitemap XML into its entries and record which root element held them
+function readSitemapDocument ( sitemapXml ) {
+  const sitemapRoot = parse( sitemapXml )
+
+  const {
+    sitemapindex = null,
+    urlset = null,
+  } = sitemapRoot
+
+  if ( sitemapindex !== null ) {
+    return { isSitemapIndex: true, entries: toEntryList( sitemapindex.sitemap ) }
+  }
+
+  if ( urlset !== null ) {
+    return { isSitemapIndex: false, entries: toEntryList( urlset.url ) }
+  }
+
+  throw new TypeError( 'Sitemap XML has neither a <sitemapindex> nor a <urlset> root element' )
+}
+
+/**
+ * Parse a sitemap document and return its entries.
+ *
+ * @param {string} sitemapXml - Raw XML of a sitemap index or a URL set
+ * @returns {Array} The `<sitemap>` entries of a sitemap index, otherwise the
+ *   `<url>` entries of a URL set; always an array, empty when there are none
+ * @throws {TypeError} When the document has neither root element
+ */
+export function parseSitemapXml ( sitemapXml ) {
+  return readSitemapDocument( sitemapXml ).entries
+}
+
+/**
+ * Read a sitemap file from disk and collect every URL entry it leads to,
+ * following sitemap indexes recursively.
+ *
+ * Child sitemaps are looked up on disk by joining the pathname of their
+ * `loc` URL onto the directory of the sitemap that references them.
+ *
+ * @param {string} sitemapPath - Path of the sitemap file to start from
+ * @returns {Promise<Array>} Flat list of the `<url>` entries found
+ */
+export async function getAllUrlsFromLocalSitemap ( sitemapPath ) {
+  // Get initial sitemap
+  const sitemapXml = await fs.readFile( sitemapPath, 'utf8' )
+  const sitemapDirectory = path.dirname( sitemapPath )
+
+  // Decide by root element instead of sniffing '.xml' in the first loc, which
+  // broke on empty sitemaps and on pages whose own URL contains '.xml'
+  const { isSitemapIndex, entries } = readSitemapDocument( sitemapXml )
+
+  if ( !isSitemapIndex ) return entries
+
+  // Resolve every child sitemap in parallel
+  const childUrlLists = await Promise.all( entries.map( async entry => {
+    const sitemapUrl = new URL( entry.loc )
+    const childSitemapPath = path.join( sitemapDirectory, sitemapUrl.pathname )
+
+    return getAllUrlsFromLocalSitemap( childSitemapPath )
+  }))
+
+  // Flatten array
+  return childUrlLists.flat()
+}
+
+/**
+ * Unimplemented stub: the loop body is empty, so the returned promise always
+ * resolves to undefined and `domain` is never read.
+ *
+ * TODO: presumably meant to try each name in `sitemapFilesToTry` for
+ * `domain` - confirm the intended behaviour before implementing.
+ *
+ * @param domain - Currently unused
+ * @returns {Promise<undefined>}
+ */
+export async function fetchParsedSitemapXmlForDomain ( domain ) {
+  for ( const sitemapFile of sitemapFilesToTry ) {
+
+  }
+}
diff --git a/scripts/download-sitemaps.js b/scripts/download-sitemaps.js
index f21d2c3..cc2fca9 100644
--- 
a/scripts/download-sitemaps.js +++ b/scripts/download-sitemaps.js @@ -1,13 +1,13 @@ import fs from 'fs-extra' import 'dotenv/config' import axios from 'axios' -import { parse } from 'fast-xml-parser' import { sitemapLocation, sitemapIndexFileName, } from '~/helpers/constants.js' +import { parseSitemapXml } from '~/helpers/api/sitemap/parse.js' ;(async () => { @@ -22,14 +22,7 @@ import { const sitemapIndexFilePath = `${ sitemapLocation }${ sitemapIndexFileName }` await fs.writeFile( sitemapIndexFilePath, sitemapIndexXML ) - // Get URLs from index - const { sitemapindex } = parse( sitemapIndexXML ) - - const { - sitemap - } = sitemapindex - - const urlEntries = Array.isArray( sitemap ) ? sitemap : [ sitemap ] + const urlEntries = parseSitemapXml( sitemapIndexXML ) // Fetch each sitemap @@ -44,6 +37,10 @@ import { // Fetch Sitemap Index const sitemapXML = await axios.get( apiSitemapUrl.href ).then( response => response.data ) + // const sitemap = parse( sitemapXML ) + + // console.log( 'sitemap', sitemap ) + // console.log( 'apiSitemapUrl', apiSitemapUrl ) const sitemapFileName = apiSitemapUrl.pathname.split('/')[1]