From bb2169ebf69e9860717b57b05914d16e50f9c90a Mon Sep 17 00:00:00 2001
From: rvogel
Date: Mon, 3 Jun 2024 17:31:38 +0200
Subject: [PATCH] Add unconvertable content to the result

There may be legacy Confluence WikiText contents included in the export,
which cannot be converted by this tool. In this case, we just want to
add them as raw text to the result.

If the raw file can not be loaded as XML, the raw text is wrapped in
"Unconvertable RAW" start/end markers and the page is added to
[[Category:Unconvertable]].
---
 src/Converter/ConfluenceConverter.php | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/src/Converter/ConfluenceConverter.php b/src/Converter/ConfluenceConverter.php
index 34dd1b6..5679040 100644
--- a/src/Converter/ConfluenceConverter.php
+++ b/src/Converter/ConfluenceConverter.php
@@ -6,6 +6,7 @@
 use DOMElement;
 use DOMNode;
 use DOMXPath;
+use Exception;
 use HalloWelt\MediaWiki\Lib\Migration\Converter\PandocHTML;
 use HalloWelt\MediaWiki\Lib\Migration\DataBuckets;
 use HalloWelt\MediaWiki\Lib\Migration\IOutputAwareInterface;
@@ -167,7 +168,13 @@ protected function doConvert( SplFileInfo $file ): string {
 			$this->currentPageTitle = 'not_current_revision_' . $pageId;
 		}
 
-		$dom = $this->preprocessFile();
+		try {
+			$dom = $this->preprocessFile();
+		}
+		catch ( Exception $e ) {
+			$rawContent = file_get_contents( $this->rawFile->getPathname() );
+			return "<-- Unconvertable RAW start-->\n$rawContent\n<-- Unconvertable RAW end-->\n[[Category:Unconvertable]]";
+		}
 		$xpath = new DOMXPath( $dom );
 		$xpath->registerNamespace( 'ac', 'some' );
 
@@ -325,7 +332,10 @@ private function preprocessFile() {
 		$dom->formatOutput = true;
 		$dom->preserveWhiteSpace = true;
 		$dom->validateOnParse = false;
-		$dom->loadXML( $source, LIBXML_PARSEHUGE );
+		$validXML = $dom->loadXML( $source, LIBXML_PARSEHUGE );
+		if ( $validXML === false ) {
+			throw new Exception( 'Unconvertable' );
+		}
 		$preprocessedPathname = str_replace( '.mraw', '.mprep', $this->rawFile->getPathname() );
 		$dom->saveHTMLFile( $preprocessedPathname );
 