Skip to content

Commit

Permalink
test case: sitemap validator - added
Browse files Browse the repository at this point in the history
  • Loading branch information
darsan-in committed Nov 29, 2024
1 parent f479576 commit f26510e
Show file tree
Hide file tree
Showing 9 changed files with 245 additions and 144 deletions.
4 changes: 3 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -83,10 +83,12 @@
"babel-jest": "29.7.0",
"commander": "^12.1.0",
"eslint": "^9.15.0",
"fast-xml-parser": "^4.5.0",
"jest": "29.7.0",
"ncp": "^2.0.0",
"rimraf": "6.0.1",
"ts-node": "10.9.2",
"typescript": "5.7.2"
"typescript": "5.7.2",
"xsd-schema-validator": "^0.10.0"
}
}
34 changes: 7 additions & 27 deletions test/main.test.ts
Original file line number Diff line number Diff line change
@@ -1,31 +1,11 @@
import { readFileSync } from "fs";
import { join } from "path";
import { validateXML } from "xsd-schema-validator";
import config from "../configLoader";
import { makeSitemap } from "../lib/utils";
import { Hawk } from "../lib/core";
import validateSitemap from "./utils/validate-sitemap";

/**
 * Builds the sitemap via `makeSitemap`, reads the file at
 * `config.sitemapPath` and validates it against the `sitemap-schema.xsd`
 * located next to this test file.
 *
 * @returns the validator's `valid` flag, or `false` when validation throws
 *          (the error is logged to the console).
 */
async function _validateSitemap(): Promise<boolean> {
  // A fresh sitemap has to exist on disk before it can be read back.
  await makeSitemap(true, [], [], true);

  const xmlSource = readFileSync(config.sitemapPath, { encoding: "utf8" });
  const schemaLocation = join(__dirname, "sitemap-schema.xsd");

  try {
    const { valid } = await validateXML(xmlSource, schemaLocation);
    return valid;
  } catch (validationError) {
    console.log(validationError);
    return false;
  }
}
// Hawk instance handed to the sitemap validation helper.
const hawkInstance = new Hawk();
// Sample-site directory passed to validateSitemap as its root path.
const testSampleRootPath = "./test/test-sample";

// NOTE(review): this view shows two assertions back to back without diff
// markers; presumably the `_validateSitemap()` line is the pre-change
// assertion and only the `validateSitemap(...)` one remains — confirm.
test("Sitemap.xml validation", async () => {
expect(await _validateSitemap()).toBe(true);
expect(await validateSitemap(testSampleRootPath, hawkInstance)).toBe(
true,
);
});
File renamed without changes.
File renamed without changes.
File renamed without changes.
232 changes: 116 additions & 116 deletions test/sitemap-schema.xsd → test/test-sample/sitemap-schema.xsd
Original file line number Diff line number Diff line change
@@ -1,116 +1,116 @@
<?xml version="1.0" encoding="UTF-8"?>
<xsd:schema xmlns:xsd="http://www.w3.org/2001/XMLSchema"
targetNamespace="http://www.sitemaps.org/schemas/sitemap/0.9"
xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
elementFormDefault="qualified">
<xsd:annotation>
<xsd:documentation>
XML Schema for Sitemap files.
Last Modified 2008-03-26
</xsd:documentation>
</xsd:annotation>

<xsd:element name="urlset">
<xsd:annotation>
<xsd:documentation>
Container for a set of up to 50,000 document elements.
This is the root element of the XML file.
</xsd:documentation>
</xsd:annotation>
<xsd:complexType>
<xsd:sequence>
<xsd:any namespace="##other" minOccurs="0" maxOccurs="unbounded" processContents="strict"/>
<xsd:element name="url" type="tUrl" maxOccurs="unbounded"/>
</xsd:sequence>
</xsd:complexType>
</xsd:element>

<xsd:complexType name="tUrl">
<xsd:annotation>
<xsd:documentation>
Container for the data needed to describe a document to crawl.
</xsd:documentation>
</xsd:annotation>
<xsd:sequence>
<xsd:element name="loc" type="tLoc"/>
<xsd:element name="lastmod" type="tLastmod" minOccurs="0"/>
<xsd:element name="changefreq" type="tChangeFreq" minOccurs="0"/>
<xsd:element name="priority" type="tPriority" minOccurs="0"/>
<xsd:any namespace="##other" minOccurs="0" maxOccurs="unbounded" processContents="strict"/>
</xsd:sequence>
</xsd:complexType>

<xsd:simpleType name="tLoc">
<xsd:annotation>
<xsd:documentation>
REQUIRED: The location URI of a document.
The URI must conform to RFC 2396 (http://www.ietf.org/rfc/rfc2396.txt).
</xsd:documentation>
</xsd:annotation>
<xsd:restriction base="xsd:anyURI">
<xsd:minLength value="12"/>
<xsd:maxLength value="2048"/>
</xsd:restriction>
</xsd:simpleType>

<xsd:simpleType name="tLastmod">
<xsd:annotation>
<xsd:documentation>
OPTIONAL: The date the document was last modified. The date must conform
to the W3C DATETIME format (http://www.w3.org/TR/NOTE-datetime).
Example: 2005-05-10
Lastmod may also contain a timestamp.
Example: 2005-05-10T17:33:30+08:00
</xsd:documentation>
</xsd:annotation>
<xsd:union>
<xsd:simpleType>
<xsd:restriction base="xsd:date"/>
</xsd:simpleType>
<xsd:simpleType>
<xsd:restriction base="xsd:dateTime"/>
</xsd:simpleType>
</xsd:union>
</xsd:simpleType>

<xsd:simpleType name="tChangeFreq">
<xsd:annotation>
<xsd:documentation>
OPTIONAL: Indicates how frequently the content at a particular URL is
likely to change. The value "always" should be used to describe
documents that change each time they are accessed. The value "never"
should be used to describe archived URLs. Please note that web
crawlers may not necessarily crawl pages marked "always" more often.
Consider this element as a friendly suggestion and not a command.
</xsd:documentation>
</xsd:annotation>
<xsd:restriction base="xsd:string">
<xsd:enumeration value="always"/>
<xsd:enumeration value="hourly"/>
<xsd:enumeration value="daily"/>
<xsd:enumeration value="weekly"/>
<xsd:enumeration value="monthly"/>
<xsd:enumeration value="yearly"/>
<xsd:enumeration value="never"/>
</xsd:restriction>
</xsd:simpleType>

<xsd:simpleType name="tPriority">
<xsd:annotation>
<xsd:documentation>
OPTIONAL: The priority of a particular URL relative to other pages
on the same site. The value for this element is a number between
0.0 and 1.0 where 0.0 identifies the lowest priority page(s).
The default priority of a page is 0.5. Priority is used to select
between pages on your site. Setting a priority of 1.0 for all URLs
will not help you, as the relative priority of pages on your site
is what will be considered.
</xsd:documentation>
</xsd:annotation>
<xsd:restriction base="xsd:decimal">
<xsd:minInclusive value="0.0"/>
<xsd:maxInclusive value="1.0"/>
</xsd:restriction>
</xsd:simpleType>

</xsd:schema>
<?xml version="1.0" encoding="UTF-8"?>
<xsd:schema xmlns:xsd="http://www.w3.org/2001/XMLSchema"
targetNamespace="http://www.sitemaps.org/schemas/sitemap/0.9"
xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
elementFormDefault="qualified">
<xsd:annotation>
<xsd:documentation>
XML Schema for Sitemap files.
Last Modified 2008-03-26
</xsd:documentation>
</xsd:annotation>

<xsd:element name="urlset">
<xsd:annotation>
<xsd:documentation>
Container for a set of up to 50,000 document elements.
This is the root element of the XML file.
</xsd:documentation>
</xsd:annotation>
<xsd:complexType>
<xsd:sequence>
<xsd:any namespace="##other" minOccurs="0" maxOccurs="unbounded" processContents="strict"/>
<xsd:element name="url" type="tUrl" maxOccurs="unbounded"/>
</xsd:sequence>
</xsd:complexType>
</xsd:element>

<xsd:complexType name="tUrl">
<xsd:annotation>
<xsd:documentation>
Container for the data needed to describe a document to crawl.
</xsd:documentation>
</xsd:annotation>
<xsd:sequence>
<xsd:element name="loc" type="tLoc"/>
<xsd:element name="lastmod" type="tLastmod" minOccurs="0"/>
<xsd:element name="changefreq" type="tChangeFreq" minOccurs="0"/>
<xsd:element name="priority" type="tPriority" minOccurs="0"/>
<xsd:any namespace="##other" minOccurs="0" maxOccurs="unbounded" processContents="strict"/>
</xsd:sequence>
</xsd:complexType>

<xsd:simpleType name="tLoc">
<xsd:annotation>
<xsd:documentation>
REQUIRED: The location URI of a document.
The URI must conform to RFC 2396 (http://www.ietf.org/rfc/rfc2396.txt).
</xsd:documentation>
</xsd:annotation>
<xsd:restriction base="xsd:anyURI">
<xsd:minLength value="12"/>
<xsd:maxLength value="2048"/>
</xsd:restriction>
</xsd:simpleType>

<xsd:simpleType name="tLastmod">
<xsd:annotation>
<xsd:documentation>
OPTIONAL: The date the document was last modified. The date must conform
to the W3C DATETIME format (http://www.w3.org/TR/NOTE-datetime).
Example: 2005-05-10
Lastmod may also contain a timestamp.
Example: 2005-05-10T17:33:30+08:00
</xsd:documentation>
</xsd:annotation>
<xsd:union>
<xsd:simpleType>
<xsd:restriction base="xsd:date"/>
</xsd:simpleType>
<xsd:simpleType>
<xsd:restriction base="xsd:dateTime"/>
</xsd:simpleType>
</xsd:union>
</xsd:simpleType>

<xsd:simpleType name="tChangeFreq">
<xsd:annotation>
<xsd:documentation>
OPTIONAL: Indicates how frequently the content at a particular URL is
likely to change. The value "always" should be used to describe
documents that change each time they are accessed. The value "never"
should be used to describe archived URLs. Please note that web
crawlers may not necessarily crawl pages marked "always" more often.
Consider this element as a friendly suggestion and not a command.
</xsd:documentation>
</xsd:annotation>
<xsd:restriction base="xsd:string">
<xsd:enumeration value="always"/>
<xsd:enumeration value="hourly"/>
<xsd:enumeration value="daily"/>
<xsd:enumeration value="weekly"/>
<xsd:enumeration value="monthly"/>
<xsd:enumeration value="yearly"/>
<xsd:enumeration value="never"/>
</xsd:restriction>
</xsd:simpleType>

<xsd:simpleType name="tPriority">
<xsd:annotation>
<xsd:documentation>
OPTIONAL: The priority of a particular URL relative to other pages
on the same site. The value for this element is a number between
0.0 and 1.0 where 0.0 identifies the lowest priority page(s).
The default priority of a page is 0.5. Priority is used to select
between pages on your site. Setting a priority of 1.0 for all URLs
will not help you, as the relative priority of pages on your site
is what will be considered.
</xsd:documentation>
</xsd:annotation>
<xsd:restriction base="xsd:decimal">
<xsd:minInclusive value="0.0"/>
<xsd:maxInclusive value="1.0"/>
</xsd:restriction>
</xsd:simpleType>

</xsd:schema>
87 changes: 87 additions & 0 deletions test/utils/validate-sitemap.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
import { XMLParser } from "fast-xml-parser";
import { globSync } from "glob";
import { existsSync, readFileSync, rmSync } from "node:fs";
import { join } from "node:path";
import { validateXML } from "xsd-schema-validator";
import { type Hawk } from "../../lib/core";

/**
 * Generates a sitemap for the sample site and verifies it end to end.
 *
 * The check passes only when all of the following hold:
 *  1. `hawkInstance.utils.makeSitemap` produced `test-sitemap.xml`;
 *  2. the file validates against `sitemap-schema.xsd`;
 *  3. the number of `<url>` entries equals the number of files matched by
 *     the lookup pattern;
 *  4. every listed URL maps to an existing `.html` file (see `_pingRoutes`).
 *
 * Side effects: overwrites `hawkInstance.configurations.sitemapPath`, and
 * temporarily switches the process working directory to
 * `testSampleRootPath` (restored before returning, even on failure). The
 * generated sitemap file is deleted once its content has been read.
 *
 * @param testSampleRootPath - Directory used as working directory for the
 *        run; the sitemap, the schema and the HTML files are all resolved
 *        relative to it.
 * @param hawkInstance - Hawk instance used to generate the sitemap.
 * @returns `true` when every check above succeeds, otherwise `false`.
 */
export default async function validateSitemap(
  testSampleRootPath: string,
  hawkInstance: Hawk,
): Promise<boolean> {
  // Remember where we started so the cwd change does not leak into other
  // tests (and so a relative root path keeps working on repeated calls).
  const previousCwd = process.cwd();
  process.chdir(testSampleRootPath);

  try {
    const lookupPattern = ["**/*.html"];
    const uploadToFTP = false;
    const expectedSitemapOutputPath = "test-sitemap.xml";

    hawkInstance.configurations.sitemapPath = expectedSitemapOutputPath;
    await hawkInstance.utils.makeSitemap(
      lookupPattern,
      [],
      false,
      uploadToFTP,
    );

    // Check that the sitemap was actually written.
    if (!existsSync(expectedSitemapOutputPath)) {
      console.log("⚠️ Sitemap not found!");
      return false;
    }

    const sitemapXML: string = readFileSync(expectedSitemapOutputPath, {
      encoding: "utf8",
    });

    // The file is no longer needed once its content is in memory.
    rmSync(expectedSitemapOutputPath, { recursive: true, force: true });

    const sitemapSchemaFile = "sitemap-schema.xsd";

    // The validator may reject instead of resolving with `valid: false`
    // (schema violations or validator runtime problems); treat a rejection
    // as a failed schema check rather than letting it escape.
    let valid = false;
    try {
      ({ valid } = await validateXML(sitemapXML, sitemapSchemaFile));
    } catch (err: unknown) {
      console.log(err);
    }

    if (!valid) {
      console.log("⚠️ Sitemap failed at schematic test");
      return false;
    }

    // Compare the number of listed routes against the number of files.
    const parsed = new XMLParser().parse(sitemapXML);

    // fast-xml-parser yields a plain object (not an array) when there is
    // exactly one <url> child, so normalise to an array before mapping.
    const rawEntries: unknown = parsed?.urlset?.url;
    const entries = (
      Array.isArray(rawEntries)
        ? rawEntries
        : rawEntries == null
          ? []
          : [rawEntries]
    ) as { loc: string; lastmod?: string }[];
    const urls = entries.map((entry) => entry.loc);

    const availableRoutes = globSync(lookupPattern);

    const numberOfRoutesinMap = urls.length;
    const numberOfFiles = availableRoutes.length;

    if (numberOfFiles !== numberOfRoutesinMap) {
      console.log(
        `⚠️ Route count mismatch: ${numberOfRoutesinMap} in sitemap, ${numberOfFiles} files found`,
      );
      return false;
    }

    // Ping every route; a single missing file fails the whole check.
    return _pingRoutes(urls);
  } finally {
    process.chdir(previousCwd);
  }
}

/**
 * Checks that every sitemap URL maps to an existing HTML file under the
 * current working directory.
 *
 * Mapping: the URL's pathname is decoded, `/` becomes `index`, and `.html`
 * is appended; the result is resolved against `process.cwd()`.
 *
 * @param urls - Absolute URLs taken from the sitemap's `<loc>` entries.
 * @returns `true` when every URL has a matching file (vacuously `true` for
 *          an empty list); `false` as soon as one is missing, after logging
 *          the offending URL.
 * @throws TypeError when an entry is not a parseable absolute URL
 *         (raised by the `URL` constructor).
 */
function _pingRoutes(urls: string[]): boolean {
  return urls.every((url: string) => {
    const { pathname } = new URL(url);

    // `URL#pathname` is percent-encoded (e.g. a space becomes `%20`), while
    // file names on disk are not; decode before touching the file system.
    let routePath = pathname;
    try {
      routePath = decodeURIComponent(pathname);
    } catch {
      // Malformed escape sequence: fall back to the raw pathname.
    }

    if (routePath === "/") routePath = "index";

    const filePath = join(process.cwd(), routePath + ".html");

    const goodRoute = existsSync(filePath);

    if (!goodRoute) {
      console.log(`⚠️ Ping failed on: ${url}`);
    }

    return goodRoute;
  });
}
1 change: 1 addition & 0 deletions tsconfig.json
Original file line number Diff line number Diff line change
Expand Up @@ -37,5 +37,6 @@
"node_modules",
"test",
"jest.config.ts",
"dist"
]
}
Loading

0 comments on commit f26510e

Please sign in to comment.