mirror of
https://github.com/ThatGuySam/doesitarm.git
synced 2026-05-15 06:35:20 -07:00
Support parsing remote sitemap indexes
This commit is contained in:
parent
ee44d3b746
commit
50af45d99c
2 changed files with 81 additions and 8 deletions
|
|
@ -7,12 +7,25 @@ import {
|
||||||
sitemapLocation,
|
sitemapLocation,
|
||||||
sitemapIndexFileName,
|
sitemapIndexFileName,
|
||||||
} from '~/helpers/constants.js'
|
} from '~/helpers/constants.js'
|
||||||
|
import { isValidHttpUrl } from '~/helpers/check-types.js'
|
||||||
|
|
||||||
const sitemapFilesToTry = [
|
const sitemapFilesToTry = [
|
||||||
sitemapIndexFileName,
|
sitemapIndexFileName,
|
||||||
'sitemap.xml'
|
'sitemap.xml'
|
||||||
]
|
]
|
||||||
|
|
||||||
|
function getDomainFromString ( urlString ) {
|
||||||
|
const url = new URL( urlString )
|
||||||
|
|
||||||
|
return url.hostname
|
||||||
|
}
|
||||||
|
|
||||||
|
function getPathFromUrl ( urlString ) {
|
||||||
|
const url = new URL( urlString )
|
||||||
|
|
||||||
|
return url.pathname
|
||||||
|
}
|
||||||
|
|
||||||
export function parseSitemapXml ( sitemapXml ) {
|
export function parseSitemapXml ( sitemapXml ) {
|
||||||
// Get URLs from index
|
// Get URLs from index
|
||||||
const sitemapRoot = parse( sitemapXml )
|
const sitemapRoot = parse( sitemapXml )
|
||||||
|
|
@ -39,8 +52,23 @@ export function parseSitemapXml ( sitemapXml ) {
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function getAllUrlsFromLocalSitemap ( sitemapPath ) {
|
export async function getAllUrlsFromLocalSitemap ( sitemapPath ) {
|
||||||
|
return await getAllUrlsFromSitemap( sitemapPath, {
|
||||||
|
getMethod: async ( sitemapPath ) => {
|
||||||
|
const sitemapXml = await fs.readFile( sitemapPath, 'utf8' )
|
||||||
|
|
||||||
|
return sitemapXml
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
export async function getAllUrlsFromSitemap ( sitemapSource, { getMethod } = {} ) {
|
||||||
|
|
||||||
|
const sitemapPath = isValidHttpUrl( sitemapSource ) ? getPathFromUrl( sitemapSource ) : sitemapSource
|
||||||
|
|
||||||
|
// console.log('sitemapPath', sitemapPath)
|
||||||
|
|
||||||
// Get initial sitemap
|
// Get initial sitemap
|
||||||
const sitemapXml = await fs.readFile( sitemapPath, 'utf8' )
|
const sitemapXml = await getMethod( sitemapSource )// fs.readFile( sitemapPath, 'utf8' )
|
||||||
const sitemapDirectory = path.dirname( sitemapPath )
|
const sitemapDirectory = path.dirname( sitemapPath )
|
||||||
|
|
||||||
// Get URLs from index
|
// Get URLs from index
|
||||||
|
|
@ -59,15 +87,58 @@ export async function getAllUrlsFromLocalSitemap ( sitemapPath ) {
|
||||||
|
|
||||||
const childSitemapPath = path.join( sitemapDirectory, sitemapUrl.pathname )
|
const childSitemapPath = path.join( sitemapDirectory, sitemapUrl.pathname )
|
||||||
|
|
||||||
return await getAllUrlsFromLocalSitemap( childSitemapPath )
|
return await getAllUrlsFromSitemap( childSitemapPath, { getMethod } )
|
||||||
}))
|
}))
|
||||||
|
|
||||||
// Flatten array
|
// Flatten array
|
||||||
return sitemaps.flat()
|
return sitemaps.flat()
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function fetchParsedSitemapXmlForDomain ( domain ) {
|
export async function fetchAllUrlsFromSitemaps ( urlString ) {
|
||||||
|
// const domain = getDomainFromString( urlString )
|
||||||
|
|
||||||
|
const allUrls = new Map()
|
||||||
|
|
||||||
for ( const sitemapFile of sitemapFilesToTry ) {
|
for ( const sitemapFile of sitemapFilesToTry ) {
|
||||||
|
|
||||||
|
const sitemapUrl = new URL( sitemapFile, urlString )
|
||||||
|
|
||||||
|
// console.log( 'sitemapUrl', sitemapUrl.href )
|
||||||
|
|
||||||
|
// Just do a quick HEAD request to see if the file exists without getting the whole body
|
||||||
|
const exists = await axios.head( sitemapUrl.href )
|
||||||
|
.catch( () => false )
|
||||||
|
.then( response => {
|
||||||
|
// console.log( 'response', response.status )
|
||||||
|
return response.status < 300
|
||||||
|
} )
|
||||||
|
|
||||||
|
// console.log( 'exists', exists )
|
||||||
|
|
||||||
|
// Skip if sitemap doesn't exist
|
||||||
|
if ( !exists ) continue
|
||||||
|
|
||||||
|
// Fetch the whole sitemap
|
||||||
|
const urls = await getAllUrlsFromSitemap( sitemapUrl.href , {
|
||||||
|
getMethod: async sitemapPath => {
|
||||||
|
const sitemapUrl = new URL( sitemapPath, urlString )
|
||||||
|
|
||||||
|
const sitemapXml = await axios.get( sitemapUrl.href )
|
||||||
|
.then( response => response.data )
|
||||||
|
|
||||||
|
return sitemapXml
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
// console.log( 'Sitemap urls', urls )
|
||||||
|
|
||||||
|
// Set our urls to the map
|
||||||
|
urls.forEach( tag => allUrls.set( tag.loc, new URL( tag.loc ) ) )
|
||||||
|
|
||||||
|
// Store sitemap urls to context
|
||||||
|
// const urlsMap = new Map( urls.map( tag => [ tag.loc, new URL( tag.loc )] ) )
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return allUrls
|
||||||
}
|
}
|
||||||
|
|
|
||||||
12
test/main.js
12
test/main.js
|
|
@ -14,7 +14,8 @@ import {
|
||||||
import { logArraysDifference } from '~/helpers/array.js'
|
import { logArraysDifference } from '~/helpers/array.js'
|
||||||
import {
|
import {
|
||||||
parseSitemapXml,
|
parseSitemapXml,
|
||||||
getAllUrlsFromLocalSitemap
|
getAllUrlsFromLocalSitemap,
|
||||||
|
fetchAllUrlsFromSitemaps
|
||||||
} from '~/helpers/api/sitemap/parse.js'
|
} from '~/helpers/api/sitemap/parse.js'
|
||||||
|
|
||||||
require('dotenv').config()
|
require('dotenv').config()
|
||||||
|
|
@ -122,11 +123,12 @@ test('Sitemap mostly matches production', async (t) => {
|
||||||
const urlsNotOnLive = new Set()
|
const urlsNotOnLive = new Set()
|
||||||
// const newLocalUrls = new Set()
|
// const newLocalUrls = new Set()
|
||||||
|
|
||||||
const liveSitemapXml = await axios( 'https://doesitarm.com/sitemap.xml' ).then( response => response.data )
|
const liveSitemapUrls = await fetchAllUrlsFromSitemaps( 'https://doesitarm.com' )
|
||||||
const liveSitemap = parser.parse( liveSitemapXml )
|
|
||||||
|
|
||||||
// Store sitemap urls to context
|
// Assert that any sitemap urls exist on live
|
||||||
const liveSitemapUrls = new Map( liveSitemap.urlset.url.map( tag => [ tag.loc, new URL( tag.loc )] ) )
|
t.assert( liveSitemapUrls.size > 0, 'No sitemap urls found on live.' )
|
||||||
|
|
||||||
|
// console.log( 'liveSitemapUrls', liveSitemapUrls )
|
||||||
|
|
||||||
|
|
||||||
for ( const localUrl of t.context.sitemapUrls ) {
|
for ( const localUrl of t.context.sitemapUrls ) {
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue