From 50af45d99cf2f57f5685b52615cda5b37576eab6 Mon Sep 17 00:00:00 2001 From: Sam Carlton Date: Mon, 13 Jun 2022 13:22:44 -0500 Subject: [PATCH] Support parsing remote sitemap indexes --- helpers/api/sitemap/parse.js | 77 ++++++++++++++++++++++++++++++++++-- test/main.js | 12 +++--- 2 files changed, 81 insertions(+), 8 deletions(-) diff --git a/helpers/api/sitemap/parse.js b/helpers/api/sitemap/parse.js index 09de3a3..7c88449 100644 --- a/helpers/api/sitemap/parse.js +++ b/helpers/api/sitemap/parse.js @@ -7,12 +7,25 @@ import { sitemapLocation, sitemapIndexFileName, } from '~/helpers/constants.js' +import { isValidHttpUrl } from '~/helpers/check-types.js' const sitemapFilesToTry = [ sitemapIndexFileName, 'sitemap.xml' ] +function getDomainFromString ( urlString ) { + const url = new URL( urlString ) + + return url.hostname +} + +function getPathFromUrl ( urlString ) { + const url = new URL( urlString ) + + return url.pathname +} + export function parseSitemapXml ( sitemapXml ) { // Get URLs from index const sitemapRoot = parse( sitemapXml ) @@ -39,8 +52,23 @@ export function parseSitemapXml ( sitemapXml ) { } export async function getAllUrlsFromLocalSitemap ( sitemapPath ) { + return await getAllUrlsFromSitemap( sitemapPath, { + getMethod: async ( sitemapPath ) => { + const sitemapXml = await fs.readFile( sitemapPath, 'utf8' ) + + return sitemapXml + } + }) +} + +export async function getAllUrlsFromSitemap ( sitemapSource, { getMethod } = {} ) { + + const sitemapPath = isValidHttpUrl( sitemapSource ) ? 
getPathFromUrl( sitemapSource ) : sitemapSource + + // console.log('sitemapPath', sitemapPath) + // Get intial sitemap - const sitemapXml = await fs.readFile( sitemapPath, 'utf8' ) + const sitemapXml = await getMethod( sitemapSource )// fs.readFile( sitemapPath, 'utf8' ) const sitemapDirectory = path.dirname( sitemapPath ) // Get URLs from index @@ -59,15 +87,58 @@ export async function getAllUrlsFromLocalSitemap ( sitemapPath ) { const childSitemapPath = path.join( sitemapDirectory, sitemapUrl.pathname ) - return await getAllUrlsFromLocalSitemap( childSitemapPath ) + return await getAllUrlsFromSitemap( childSitemapPath, { getMethod } ) })) // Flatten array return sitemaps.flat() } -export async function fetchParsedSitemapXmlForDomain ( domain ) { +export async function fetchAllUrlsFromSitemaps ( urlString ) { + // const domain = getDomainFromString( urlString ) + + const allUrls = new Map() + for ( const sitemapFile of sitemapFilesToTry ) { + const sitemapUrl = new URL( sitemapFile, urlString ) + + // console.log( 'sitemapUrl', sitemapUrl.href ) + + // Just do a quick HEAD request to see if the file exists without getting the whole body + const exists = await axios.head( sitemapUrl.href ) + .catch( () => false ) + .then( response => { + // console.log( 'response', response.status ) + return response.status < 300 + } ) + + // console.log( 'exists', exists ) + + // Skip if sitemap doesn't exist + if ( !exists ) continue + + // Fetch the whole sitemap + const urls = await getAllUrlsFromSitemap( sitemapUrl.href , { + getMethod: async sitemapPath => { + const sitemapUrl = new URL( sitemapPath, urlString ) + + const sitemapXml = await axios.get( sitemapUrl.href ) + .then( response => response.data ) + + return sitemapXml + } + }) + + // console.log( 'Sitemap urls', urls ) + + // Set our urls to the map + urls.forEach( tag => allUrls.set( tag.loc, new URL( tag.loc ) ) ) + + // Store sitemap urls to context + // const urlsMap = new Map( urls.map( tag => [ tag.loc, new URL( 
tag.loc )] ) ) + } + + return allUrls } diff --git a/test/main.js b/test/main.js index c393346..af3bf9a 100644 --- a/test/main.js +++ b/test/main.js @@ -14,7 +14,8 @@ import { import { logArraysDifference } from '~/helpers/array.js' import { parseSitemapXml, - getAllUrlsFromLocalSitemap + getAllUrlsFromLocalSitemap, + fetchAllUrlsFromSitemaps } from '~/helpers/api/sitemap/parse.js' require('dotenv').config() @@ -122,11 +123,12 @@ test('Sitemap mostly matches production', async (t) => { const urlsNotOnLive = new Set() // const newLocalUrls = new Set() - const liveSitemapXml = await axios( 'https://doesitarm.com/sitemap.xml' ).then( response => response.data ) - const liveSitemap = parser.parse( liveSitemapXml ) + const liveSitemapUrls = await fetchAllUrlsFromSitemaps( 'https://doesitarm.com' ) - // Store sitemap urls to context - const liveSitemapUrls = new Map( liveSitemap.urlset.url.map( tag => [ tag.loc, new URL( tag.loc )] ) ) + // Assert that any sitemap urls exist on live + t.assert( liveSitemapUrls.size > 0, 'No sitemap urls found on live.' ) + + // console.log( 'liveSitemapUrls', liveSitemapUrls ) for ( const localUrl of t.context.sitemapUrls ) {