File: //var/www/quadcode.com/node_modules/super-sitemap/dist/sampled.js
import dirTree from 'directory-tree';
import { XMLParser } from 'fast-xml-parser';
import { filterRoutes } from './sitemap.js';
/**
 * Given the URL to this project's sitemap, _which must have been generated by
 * Super Sitemap for this to work as designed_, returns an array containing:
 * 1. the URL of every static route, and
 * 2. one URL for every parameterized route.
 *
 * ```js
 * // Example result:
 * [ 'http://localhost:5173/', 'http://localhost:5173/about', 'http://localhost:5173/blog', 'http://localhost:5173/blog/hello-world', 'http://localhost:5173/blog/tag/red' ]
 * ```
 *
 * @public
 * @param sitemapUrl - E.g. http://localhost:5173/sitemap.xml
 * @returns Array of URLs, one for each route; grouped by static, then dynamic; sub-sorted alphabetically.
 * @throws Error when the sitemap request reports a non-OK HTTP status.
 *
 * @remarks
 * - This is intended as a utility to gather unique URLs for SEO analysis,
 *   functional tests for public routes, etc.
 * - As a utility, the design favors ease of use for the developer over runtime
 *   performance, and consequently consumes `/sitemap.xml` directly, to avoid
 *   the developer needing to recreate and maintain a duplicate sitemap config,
 *   param values, exclusion rules, etc.
 * - LIMITATIONS:
 *   1. The result does not include `additionalPaths` from the sitemap config
 *      b/c it's impossible to identify those by pattern using only the result.
 *   2. This does not distinguish between routes that differ only due to a
 *      pattern matcher–e.g. `/foo/[foo]` and `/foo/[foo=integer]` will be
 *      evaluated as `/foo/[foo]` and one sample URL will be returned.
 */
export async function sampledUrls(sitemapUrl) {
  const response = await fetch(sitemapUrl);
  // Fail loudly on HTTP errors instead of handing an error page to the XML
  // parser. Compared strictly against `false` so minimal fetch test doubles
  // that omit `ok` keep working.
  if (response.ok === false) {
    throw new Error(
      `Failed to fetch sitemap at ${sitemapUrl}: HTTP ${response.status} ${response.statusText}`
    );
  }
  const sitemapXml = await response.text();
  return await _sampledUrls(sitemapXml);
}
/**
 * Given the URL to this project's sitemap, _which must have been generated by
 * Super Sitemap for this to work as designed_, returns an array containing:
 * 1. the path of every static route, and
 * 2. one path for every parameterized route.
 *
 * ```js
 * // Example result:
 * [ '/', '/about', '/blog', '/blog/hello-world', '/blog/tag/red' ]
 * ```
 *
 * @public
 * @param sitemapUrl - E.g. http://localhost:5173/sitemap.xml
 * @returns Array of paths, one for each route; grouped by static, then dynamic; sub-sorted alphabetically.
 * @throws Error when the sitemap request reports a non-OK HTTP status.
 *
 * @remarks
 * - This is intended as a utility to gather unique paths for SEO analysis,
 *   functional tests for public routes, etc.
 * - As a utility, the design favors ease of use for the developer over runtime
 *   performance, and consequently consumes `/sitemap.xml` directly, to avoid
 *   the developer needing to recreate and maintain a duplicate sitemap config,
 *   param values, exclusion rules, etc.
 * - LIMITATIONS:
 *   1. The result does not include `additionalPaths` from the sitemap config
 *      b/c it's impossible to identify those by pattern using only the result.
 *   2. This does not distinguish between routes that differ only due to a
 *      pattern matcher–e.g. `/foo/[foo]` and `/foo/[foo=integer]` will be
 *      evaluated as `/foo/[foo]` and one sample path will be returned.
 */
export async function sampledPaths(sitemapUrl) {
  const response = await fetch(sitemapUrl);
  // Fail loudly on HTTP errors instead of handing an error page to the XML
  // parser. Compared strictly against `false` so minimal fetch test doubles
  // that omit `ok` keep working.
  if (response.ok === false) {
    throw new Error(
      `Failed to fetch sitemap at ${sitemapUrl}: HTTP ${response.status} ${response.statusText}`
    );
  }
  const sitemapXml = await response.text();
  return await _sampledPaths(sitemapXml);
}
/**
 * Given the body of this site's sitemap.xml, returns an array containing:
 * 1. the URL of every static (non-parameterized) route, and
 * 2. one URL for every parameterized route.
 *
 * @private
 * @param sitemapXml - The XML string of the sitemap (or sitemap index) to
 * analyze. This must have been created by Super Sitemap to work as designed.
 * @returns Array of URLs: static route URLs first, then one sampled URL per
 * dynamic route; each group sorted alphabetically. Empty when the sitemap
 * lists no URLs.
 */
export async function _sampledUrls(sitemapXml) {
  // The XML parser yields a bare object (not an array) for a tag that occurs
  // only once, and nothing at all for a tag that is absent. Normalize both
  // cases so single-entry and empty sitemaps are handled like any other.
  const toArray = (value) => {
    if (value === undefined || value === null) {
      return [];
    }
    return Array.isArray(value) ? value : [value];
  };
  // Escapes regex metacharacters so literal text (e.g. the "." in a host name
  // or a static path segment) is matched literally.
  const escapeRegex = (text) => text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');

  const parser = new XMLParser();
  const sitemap = parser.parse(sitemapXml);
  let urls = [];
  // If this is a sitemap index, fetch all sub sitemaps and combine their URLs.
  // Note: _sampledUrls() is intended to be used by devs within Playwright
  // tests. Because of this, we know what host to expect and can replace
  // whatever origin the dev set with localhost:4173, which is where Playwright
  // serves the app during testing. For unit tests, our mock.js mocks also
  // expect this host.
  if (sitemap.sitemapindex) {
    const subSitemapUrls = toArray(sitemap.sitemapindex.sitemap).map((obj) => obj.loc);
    for (const url of subSitemapUrls) {
      const path = new URL(url).pathname;
      const res = await fetch('http://localhost:4173' + path);
      const xml = await res.text();
      const subSitemap = parser.parse(xml);
      // Push one at a time; spreading a very large array into push() can
      // exceed the engine's argument limit.
      for (const entry of toArray(subSitemap.urlset.url)) {
        urls.push(entry.loc);
      }
    }
  } else {
    urls = toArray(sitemap.urlset.url).map((x) => x.loc);
  }
  // Nothing to sample, and no origin can be derived from an empty list.
  if (urls.length === 0) {
    return [];
  }

  // Can't use `import.meta.glob('/src/routes/**/+page.svelte')` here because
  // Playwright doesn't use Vite. Read /src/routes from disk to build 'routes'.
  let routes = [];
  try {
    // Strip the "file://" protocol and percent-decode, so project directories
    // containing spaces or non-ASCII characters resolve on disk.
    const filePath = decodeURIComponent(new URL(import.meta.url).pathname);
    let projDir;
    if (filePath.includes('node_modules')) {
      // Currently running as an npm package.
      projDir = filePath.split('node_modules')[0];
    } else {
      // Currently running unit tests during dev.
      projDir = filePath.split('/src/')[0];
      projDir += '/';
    }
    const dirTreeRes = dirTree(projDir + 'src/routes');
    routes = extractPaths(dirTreeRes);
    // Match +page.svelte or +page@.svelte (used to break out of a layout).
    // https://kit.svelte.dev/docs/advanced-routing#advanced-layouts-breaking-out-of-layouts
    routes = routes.filter((route) => route.match(/\+page.*\.svelte$/));
    // 1. Trim everything to the left of '/src/routes/' so each path starts
    //    with `/src/routes/` before being handed to `filterRoutes()`.
    // 2. Remove all grouping segments, i.e. those starting with '(' and ending
    //    with ')'.
    // Skipped when no page files were found, since there is no first route to
    // measure the prefix from.
    if (routes.length > 0) {
      const i = routes[0].indexOf('/src/routes/');
      const regex = /\/\([^)]+\)/g;
      routes = routes.map((route) => route.slice(i).replace(regex, ''));
    }
  } catch (err) {
    // Best effort: continue with no routes rather than failing the caller.
    console.error('An error occurred:', err);
  }
  // Filter to reformat from file paths into site paths. The 2nd arg for
  // excludePatterns is empty because the exclusion pattern was already applied
  // during generation of the sitemap.
  routes = filterRoutes(routes, []);
  // Remove any optional `/[[lang]]` prefix. We can just use the default
  // language, which will not have this stem, for the purposes of this
  // sampling. But ensure root becomes '/', not an empty string.
  routes = routes.map((route) => {
    return route.replace(/\/?\[\[lang(=[a-z]+)?\]\]/, '') || '/';
  });
  // Separate static and dynamic routes. Remember these are _routes_ from disk
  // and consequently have not had any exclusion patterns applied against them;
  // they could contain `/about`, `/blog/[slug]`, and routes that will need to
  // be excluded like `/dashboard`.
  const nonExcludedStaticRoutes = [];
  const nonExcludedDynamicRoutes = [];
  for (const route of routes) {
    if (/\[.*\]/.test(route)) {
      nonExcludedDynamicRoutes.push(route);
    } else {
      nonExcludedStaticRoutes.push(route);
    }
  }
  const ORIGIN = new URL(urls[0]).origin;
  const nonExcludedStaticRouteUrls = new Set(nonExcludedStaticRoutes.map((path) => ORIGIN + path));
  // Using URLs as the source, separate into static and dynamic routes. This:
  // 1. Gathers URLs that are static routes. We cannot use the static routes
  //    directly because they are generated from reading `/src/routes` and have
  //    not had the dev's `excludePatterns` applied, so an excluded route like
  //    `/dashboard` could exist within them, but _won't_ in the sitemap URLs.
  // 2. Removes static routes from the sitemap URLs before sampling for dynamic
  //    paths, which is necessary due to SvelteKit's route specificity rules.
  //    E.g. we remove paths like `/about` so they aren't sampled as a match
  //    for a dynamic route like `/[foo]`.
  const dynamicRouteUrls = [];
  const staticRouteUrls = [];
  for (const url of urls) {
    if (nonExcludedStaticRouteUrls.has(url)) {
      staticRouteUrls.push(url);
    } else {
      dynamicRouteUrls.push(url);
    }
  }
  // Convert dynamic route patterns into regex patterns.
  // - Use Set to make unique. Duplicates may occur given we haven't applied
  //   excludePatterns to the dynamic **routes** (e.g. `/blog/[page=integer]`
  //   and `/blog/[slug]` both become `/blog/[^/]+`). When we sample URLs for
  //   each of these patterns, however, the excluded patterns won't exist in
  //   the URLs from the sitemap, so it's not a problem.
  // - ORIGIN is required, otherwise a false match can be found when one
  //   pattern is a subset of another. Merely terminating with "$" is not
  //   sufficient; an overlapping subset may still be found from the end. The
  //   pattern is therefore anchored at both ends.
  // - Literal text (origin and static segments) is escaped; only the
  //   `[param]` placeholders become `[^/]+`.
  const regexPatterns = new Set(nonExcludedDynamicRoutes.map((path) => {
    const regexPattern = path.split(/\[[^\]]+\]/).map(escapeRegex).join('[^/]+');
    return '^' + escapeRegex(ORIGIN) + regexPattern + '$';
  }));
  // Gather a max of one URL for each dynamic route's regex pattern.
  // - Remember, a regex pattern may exist in these routes that was excluded by
  //   the exclusionPatterns when the sitemap was generated. This is OK because
  //   no URLs will exist to be matched with them.
  const sampledDynamicUrls = findFirstMatches(regexPatterns, dynamicRouteUrls);
  return [...staticRouteUrls.sort(), ...Array.from(sampledDynamicUrls).sort()];
}
/**
 * Path-only counterpart of `_sampledUrls()`: takes the body of this site's
 * sitemap.xml and yields
 * 1. the path of every static (non-parameterized) route, and
 * 2. one path for every parameterized route.
 *
 * @private
 * @param sitemapXml - XML string of the sitemap to analyze. It must have been
 * created by Super Sitemap to work as designed.
 * @returns Array of paths, in the same order as the URLs produced by
 * `_sampledUrls()`.
 */
export async function _sampledPaths(sitemapXml) {
  const paths = [];
  for (const href of await _sampledUrls(sitemapXml)) {
    // Keep only the path component; origin, query and hash are dropped.
    const { pathname } = new URL(href);
    paths.push(pathname);
  }
  return paths;
}
/**
 * For every regex pattern supplied, looks through the haystack in order and
 * keeps the first string that the pattern matches. A pattern that matches
 * nothing simply contributes no entry.
 *
 * @private
 * @param regexPatterns - Set of regex patterns to search for.
 * @param haystack - Array of strings to search within.
 * @returns Set of strings where each is the first match found for a pattern.
 *
 * @example
 * ```ts
 * const patterns = new Set(["a.*", "b.*"]);
 * const haystack = ["apple", "banana", "cherry"];
 * const result = findFirstMatches(patterns, haystack); // Set { 'apple', 'banana' }
 * ```
 */
export function findFirstMatches(regexPatterns, haystack) {
  const matched = new Set();
  for (const source of regexPatterns) {
    const matcher = new RegExp(source);
    // findIndex stops at the first hit, mirroring a loop with `break`.
    const position = haystack.findIndex((candidate) => matcher.test(candidate));
    if (position !== -1) {
      matched.add(haystack[position]);
    }
  }
  return matched;
}
/**
 * Flattens a dirTree response into a list of the `path` values of the node
 * itself and of everything nested under `children`, parent before children
 * and siblings in their listed order.
 * - The result includes directories as well as files, so callers that want
 *   routes still need to keep only the entries ending in `+page.svelte`.
 *
 * @param obj - The dirTree response object. https://www.npmjs.com/package/directory-tree
 * @param paths - Array of existing paths to append to (normally left
 * unspecified). The same array is appended to and returned.
 * @returns An array of strings representing disk paths to each route.
 */
export function extractPaths(obj, paths = []) {
  // Explicit stack instead of recursion; children are pushed in reverse so
  // they are popped, and therefore visited, in their original order.
  const pending = [obj];
  while (pending.length > 0) {
    const node = pending.pop();
    if (node.path) {
      paths.push(node.path);
    }
    const kids = node.children;
    if (Array.isArray(kids)) {
      for (let idx = kids.length - 1; idx >= 0; idx -= 1) {
        pending.push(kids[idx]);
      }
    }
  }
  return paths;
}