From 7b9c9d4f5ec8f83c0bcccc95872d032806a70d1b Mon Sep 17 00:00:00 2001 From: Chris Hennes Date: Sun, 31 Jan 2021 17:24:49 -0600 Subject: [PATCH] Add tests for Start Page HTML validity This test is in two parts: first the generated HTML is sanitized to remove any potentially sensitive information (e.g. filenames, authors, document info, etc.) and is then sent to the W3C Validator service at https://validator.w3.org/nu. The results are interrogated and if any errors or warnings are returned, the test fails. If the site cannot be reached this is NOT treated as a test failure. Second, the actual (unsanitized) filenames are checked for validity: the HTML standard prohibits backslashes in URLs, even if the URL refers to a local file on a system that uses backslashes as a path separator (e.g. Windows). This would have been caught by the W3C Validator if we had not sanitized the filenames. --- src/Mod/Start/TestStart/TestStartPage.py | 100 ++++++++++++++++++++++- 1 file changed, 98 insertions(+), 2 deletions(-) diff --git a/src/Mod/Start/TestStart/TestStartPage.py b/src/Mod/Start/TestStart/TestStartPage.py index 9e0740a658..7cc1da5761 100644 --- a/src/Mod/Start/TestStart/TestStartPage.py +++ b/src/Mod/Start/TestStart/TestStartPage.py @@ -24,7 +24,7 @@ import unittest import FreeCAD import Start from StartPage import StartPage -from html.parser import HTMLParser +import re class TestStartPage(unittest.TestCase): """Basic validation of the generated Start page.""" @@ -35,6 +35,7 @@ class TestStartPage(unittest.TestCase): def setUp(self): pass + def test_all_css_placeholders_removed(self): """Check to see if all of the CSS placeholders have been replaced.""" placeholders = ["BACKGROUND","BGTCOLOR","FONTFAMILY","FONTSIZE","LINKCOLOR", @@ -44,6 +45,7 @@ class TestStartPage(unittest.TestCase): for placeholder in placeholders: self.assertNotIn (placeholder, page, "{} was not removed from the CSS".format(placeholder)) + def test_all_js_placeholders_removed(self): """Check to see 
if all of the JavaScript placeholders have been replaced.""" placeholders = ["IMAGE_SRC_INSTALLED"] @@ -51,6 +53,7 @@ class TestStartPage(unittest.TestCase): for placeholder in placeholders: self.assertNotIn (placeholder, page, "{} was not removed from the JS".format(placeholder)) + def test_all_html_placeholders_removed(self): """Check to see if all of the HTML placeholders have been replaced.""" placeholders = ["T_TITLE","VERSIONSTRING","T_DOCUMENTS","T_HELP","T_ACTIVITY", @@ -69,4 +72,97 @@ class TestStartPage(unittest.TestCase): page = StartPage.handle() for placeholder in placeholders: self.assertNotIn (placeholder, page, "{} was not removed from the HTML".format(placeholder)) - \ No newline at end of file + + + def test_files_do_not_contain_backslashes(self): + # This would be caught by the W3C validator if we didn't sanitize the filenames before sending them. + page = StartPage.handle() + fileRE = re.compile(r'"file:///(.*?)"') + results = fileRE.findall(string=page) + + badFilenames = [] + for result in results: + if result.find("\\") != -1: + badFilenames.append(result) + + if len(badFilenames) > 0: + self.fail("The following filenames contain backslashes, which is prohibited in HTML: {}".format(badFilenames)) + + + def test_html_validates(self): + # Send the generated html to the W3C validator for analysis (removing potentially-sensitive data first) + import urllib.request + import os + import json + page = self.sanitize(StartPage.handle()) # Remove potentially sensitive data + + # For debugging, if you want to ensure that the sanitization worked correctly: + # from pathlib import Path + # home = str(Path.home()) + # f=open(home+"/test.html", "w") + # f.write(page) + # f.close() + + validation_url = "https://validator.w3.org/nu/?out=json" + data = page.encode('utf-8') # data should be bytes + req = urllib.request.Request(validation_url, data) + req.add_header("Content-type","text/html; charset=utf-8") + errorCount = 0 + warningCount = 0 + infoCount = 0 + 
validationResultString = "" + try: + with urllib.request.urlopen (req) as response: + text = response.read() + + responseJSON = json.loads(text) + + for message in responseJSON["messages"]: + if "type" in message: + if message["type"] == "info": + if "subtype" in message: + if message["subtype"] == "warning": + warningCount += 1 + validationResultString += "WARNING: {}\n".format(ascii(message["message"])) + else: + infoCount += 1 + validationResultString += "INFO: {}\n".format(ascii(message["message"])) + elif message["type"] == "error": + errorCount += 1 + validationResultString += "ERROR: {}\n".format(ascii(message["message"])) + elif message["type"] == "non-document-error": + FreeCAD.Console.PrintWarning("W3C validator returned a non-document error:\n {}".format(message)) + return + + except urllib.error.HTTPError as e: + FreeCAD.Console.PrintWarning("W3C validator returned response code {}".format(e.code)) + + except urllib.error.URLError: + FreeCAD.Console.PrintWarning("Could not communicate with W3C validator") + + if errorCount > 0 or warningCount > 0: + StartPage.exportTestFile() + FreeCAD.Console.PrintWarning("HTML validation failed: Start page source written to your home directory for analysis.") + self.fail("W3C Validator analysis shows the Start page has {} errors and {} warnings:\n\n{}".format(errorCount, warningCount, validationResultString)) + elif infoCount > 0: + FreeCAD.Console.PrintWarning("The Start page is valid HTML, but the W3C sent back {} informative messages:\n{}.".format(infoCount,validationResultString)) + + def sanitize (self, html): + + # Anonymize all local filenames + fileRE = re.compile(r'"file:///.*?"') + html = fileRE.sub(repl=r'"file:///A/B/C"', string=html) + + # Anonymize titles, which are used for mouseover text and might contain document information + titleRE = re.compile(r'title="[\s\S]*?"') # Some titles have newlines in them + html = titleRE.sub(repl=r'title="Y"', string=html) + + # Anonymize the document names, which we 
display in <h4> tags + h4RE = re.compile(r'<h4>.*?</h4>') + html = h4RE.sub(repl=r'<h4>Z</h4>', string=html) + + # Remove any simple single-line paragraphs, which might contain document author information, file size information, etc. + pRE = re.compile(r'<p>[^<]*?</p>') + html = pRE.sub(repl=r'<p>X</p>', string=html) + + return html