Skip to content

Commit

Permalink
updated confluence scraper
Browse files Browse the repository at this point in the history
  • Loading branch information
mishraomp committed Jan 18, 2024
1 parent 6204dba commit faaa5d7
Show file tree
Hide file tree
Showing 3 changed files with 264 additions and 25 deletions.
85 changes: 62 additions & 23 deletions confluence-scraper/index.js
Original file line number Diff line number Diff line change
@@ -1,12 +1,16 @@
'use strict'
import { setOutput } from '@actions/core';
import fs from 'fs';
import { promises as fs_promises } from 'fs';
import {setOutput} from '@actions/core';
import fs, {promises as fs_promises} from 'fs';
import TurndownService from 'turndown';
import http from 'https';
import axios from 'axios';
const turndownService = new TurndownService();
import { tidy } from 'htmltidy2';
import {load} from 'cheerio';

const turndownService = new TurndownService();
import turndownPluginGfm from 'turndown-plugin-gfm';
const gfm = turndownPluginGfm.gfm;
turndownService.use(gfm);
const server = http.createServer();
server.listen(3002); // just to make sure the process does not exit.
const BASE_URL = process.env.BASE_URL;
Expand All @@ -18,35 +22,70 @@ async function processPageIdList() {
password: process.env.CONFLUENCE_TOKEN
}
};
let i=1;
let isUpdated = false;
let i = 1;
let isUpdated = false; // used to set the output variable for GitHub action.
for (const page_id of PAGE_ID_LIST) {
let updateIndividually = false;
try {
const sideBar = `---\nsidebar_position: ${i++}\n---\n`;
const response = await axios.get(`${BASE_URL}/rest/api/content/${page_id}?expand=body.storage`, options);
const folderPath = `../patterns/docs/${response.data.title}`;
const filePath = `${folderPath}/data.json`;
if (fs.existsSync(filePath)) {
const fileData = await fs_promises.readFile(filePath,'utf-8');
if (fileData !== JSON.stringify(response.data)) {
await fs_promises.writeFile(filePath, JSON.stringify(response?.data));
const markdown=turndownService.turndown(response?.data?.body?.storage?.value);
await fs_promises.writeFile(folderPath + `/${response.data.title}.md`, sideBar+markdown);
if(!isUpdated){
const html = response?.data?.body?.storage?.value;
const $ = load(html);
// Remove all blank elements
$('*').each((i, el) => {
if ($(el).children().length === 0 && $(el).text().trim() === '') {
$(el).remove();
}
});

// Remove all styles from span elements
$('span').replaceWith(function() {
return $(this).contents();
});

// Get the updated HTML
const updatedHtml = $.html();
tidy(updatedHtml, {
indent: true,
bare: true,
breakBeforeBr: true,
fixUri: true,
wrap: 0,

doctype: 'html5',
hideComments: false, // multi word options can use a hyphen or "camel case"
},async (err, formattedHtml) => {
if (err) {
console.error(err);
process.exit(1);
}
const markdown = turndownService.turndown(formattedHtml);
if (fs.existsSync(filePath)) {
const fileData = await fs_promises.readFile(filePath, 'utf-8');
if (fileData !== JSON.stringify(response.data)) {
if (!isUpdated) {
isUpdated = true;
}
updateIndividually = true;
} else {
console.info('It is already the latest version', page_id);
}
} else {
fs.mkdirSync(folderPath, {recursive: true});
updateIndividually = true;
if (!isUpdated) {
isUpdated = true;
}
}else{
console.info('It is already the latest version', page_id);
}
} else {
fs.mkdirSync(folderPath, { recursive: true });
await fs_promises.writeFile(filePath, JSON.stringify(response?.data));
const markdown=turndownService.turndown(response?.data?.body?.storage?.value);
await fs_promises.writeFile(folderPath + `/${response.data.title}.md`, sideBar+markdown);
if(!isUpdated){
isUpdated = true;
if (updateIndividually) {
await fs_promises.writeFile(filePath, JSON.stringify(response?.data));
await fs_promises.writeFile(folderPath + `/${response.data.title}.md`, sideBar + markdown);
}
}

});

} catch (err) {
console.error(err);
process.exit(1);
Expand Down
199 changes: 198 additions & 1 deletion confluence-scraper/package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 4 additions & 1 deletion confluence-scraper/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,11 @@
},
"homepage": "https://github.com/bcgov/nr-arch-templates#readme",
"dependencies": {
"@actions/core": "^1.10.0",
"axios": "1.6.5",
"cheerio": "^1.0.0-rc.12",
"htmltidy2": "^1.1.1",
"turndown": "^7.1.2",
"@actions/core": "^1.10.0"
"turndown-plugin-gfm": "^1.0.2"
}
}

0 comments on commit faaa5d7

Please sign in to comment.