1 ) {
    // Drop the last path element and join the remaining ones with "/"
    // onto $parent_id.
    array_pop( $patharr );
    foreach ( $patharr as $pathelem ) {
        if ( strlen( $parent_id ) > 0 ) {
            $parent_id .= "/" . $pathelem;
        } else {
            $parent_id = $pathelem;
        }
    }
}

// document id and parent document id
// Both ids are flattened by turning every "/" into "-".
$id = str_replace( "/", "-", $id );
$parent_id = str_replace( "/", "-", $parent_id );

if ( $processFiles ) {
    // fix xhtml errors
    // Both paths are shell-quoted so that names containing spaces,
    // parentheses, "<", "&" and similar characters reach tidy intact
    // instead of being interpreted by the shell.
    exec( "tidy -config config.txt -o " . escapeshellarg( $targetfile ) . " " . escapeshellarg( $file ) );
}

// load xhtml and fix tags
$dom = new DomDocument( '1.0', 'UTF-8' );
$dom->loadHTMLFile( $targetfile );

if ( $processFiles ) {
    // delete scripts
    deleteNodes( $dom );

    // fix links to styles
    // Every <link> href is reduced to its bare file name; <link> elements
    // whose rel is not "stylesheet" are removed from the document.
    $nodes = getNodesByName( $dom, "link" );
    foreach ( $nodes as $e ) {
        if ( $e->hasAttribute( "href" ) ) {
            $e->setAttribute( "href", basename( $e->getAttribute( "href" ) ) );
        }
        if ( $e->getAttribute( "rel" ) != "stylesheet" ) {
            $e->parentNode->removeChild( $e );
        }
    }

    // remove divs with internal info > 1mb
    // All removal rules are evaluated in one condition so that a div which
    // matches more than one rule is removed exactly once (a second removal
    // would dereference a null parentNode).
    $nodes = getNodesByName( $dom, "div" );
    foreach ( $nodes as $e ) {
        if ( $e->getAttribute( "class" ) == "printfooter"
            || strpos( $e->getAttribute( "class" ), "catlinks" ) !== false
            || strpos( $e->getAttribute( "class" ), "coliru-btn" ) !== false
            || $e->getAttribute( 'id' ) == "siteSub"
            || $e->getAttribute( 'id' ) == "mw-js-message"
            // remove navbar?
            || $e->getAttribute( "class" ) == "t-navbar"
        ) {
            $e->parentNode->removeChild( $e );
        }
    }

    // fix links to other pages
    $nodes = array_merge( getNodesByName( $dom, "a" ), getNodesByName( $dom, "area" ) );
    foreach ( $nodes as $e ) {
        if ( $e->hasAttribute( "title" ) ) {
            // Titled link: the target file name is derived from the title.
            $href = $e->getAttribute( "href" );
            $hash = false;
            $hashes = explode( '#', $href, 2 );
            if ( count( $hashes ) === 2 ) {
                $hash = $hashes[1];
            }
            // NOTE: any href containing "http" anywhere is left untouched.
            if ( strpos( $href, "http" ) === false ) {
                $new_value = str_replace( "/", "-", $e->getAttribute( "title" ) );
                // fix bad titles where space is used instead of _
                $new_value = str_replace( " ", "_", $new_value );
                // Explicit comparison instead of a truthiness test, so that
                // a fragment of "0" is kept rather than silently dropped.
                if ( $hash !== false && $hash !== "" ) {
                    $e->setAttribute( "href", $new_value . ".html#" . $hash );
                } else {
                    $e->setAttribute( "href", $new_value . ".html" );
                }
                // remove title from links?
                $e->removeAttribute( "title" );
            }
        } else if ( $e->hasAttribute( "href" ) ) {
            // Untitled link: the target is rebuilt from the relative href.
            $href = $e->getAttribute( "href" );
            $hash = false;
            $hashes = explode( '#', $href, 2 );
            if ( count( $hashes ) === 2 ) {
                $href = $hashes[0];
                $hash = $hashes[1];
            }
            // An empty path part means the link points into the current
            // page (e.g. href="#Example"). Rewriting it would produce a
            // file name ending in "-.html", so it is left as it is.
            if ( $href === "" ) {
                continue;
            }
            // NOTE: any href containing "http" anywhere is left untouched.
            if ( strpos( $href, "http" ) === false ) {
                // bad link
                $noTitleLinks++;
                $relativenamefix = str_replace( DIRECTORY_SEPARATOR, "/", $relativename );
                $relative_folders = explode( "/", $relativenamefix );
                $unFolders = substr_count( $href, "../" );
                $hrefname = str_replace( "../", "", $href );
                $hrefname = str_replace( ".html", "", $hrefname );
                // One extra pop on top of the "../" count removes the last
                // element of $relativename itself.
                $unFolders++;
                for ( $ind = 0; $ind < $unFolders; $ind++ ) {
                    array_pop( $relative_folders );
                }
                $fref_folders = explode( "/", $hrefname );
                foreach ( $fref_folders as $frf ) {
                    $relative_folders[] = $frf;
                }
                $fixedhref = implode( "-", $relative_folders );
                if ( $hash !== false && $hash !== "" ) {
                    // fix hashes
                    $e->setAttribute( "href", $fixedhref . ".html#" . $hash );
                } else {
                    $e->setAttribute( "href", $fixedhref . ".html" );
                }
            }
        }
    }

    // remove unwanted colspan="5", about 4mb of text!
    // The comparison is deliberately loose (numeric string vs. integer).
    $nodes = getNodesByName( $dom, "td" );
    foreach ( $nodes as $e ) {
        if ( $e->hasAttribute( "colspan" ) && $e->getAttribute( "colspan" ) == 5 ) {
            $e->removeAttribute( "colspan" );
            $colspans++;
        }
    }

    // fix links to images
    // Every <img> src is reduced to its bare file name.
    $nodes = getNodesByName( $dom, "img" );
    foreach ( $nodes as $e ) {
        if ( $e->hasAttribute( "src" ) ) {
            $e->setAttribute( "src", basename( $e->getAttribute( "src" ) ) );
        }
    }
}
// getting and fixing document title remplace