Tools: Removing obsolete wiki-to-qhelp scripts

Note: A copy of these scripts is kept
at https://github.com/yorikvanhavre/FreeCAD-offlinedoc-scripts
for archiving
This commit is contained in:
Yorik van Havre
2022-03-24 09:46:50 +01:00
parent aae0201a8b
commit a4ea92e719
7 changed files with 0 additions and 1430 deletions

View File

@@ -1,41 +0,0 @@
This suite of tools can be used to retrieve a local copy
from the FreeCAD wiki and then use it to generate qhelp
and pdf files. The downloading of the entire wiki is now
a huge operation, prone to network errors, so it has been
cut into 2 parts, one to retrieve a list of files to
download and another to actually download the files.
1) run "buildwikiindex.py" to build an index file containing
a list of all the files to download
2) run "downloadwiki.py". If connection drops, run it again,
the already downloaded files will be skipped.
2b) Dirty hack: run "fixlinks.py" to fix wrong html links
(downloadwiki.py should be fixed in the future)
3) run "buildqhelp.py" to generate freecad.qhc and freecad.qch
files
4) run "buildpdf.py" to generate freecad.pdf (wkhtmltopdf must be installed)
5) the qhelp files can be tested with "assistant -collectionFile freecad.qhc"
6) If you have already downloaded the whole wiki, run "update.py" immediately
after, to create a list of revision IDs for each page.
7) Once the initial revisions list has been created, the "update.py" script
can be run at any time in the future, to check for pages that have changed
since the stored revision ID. The script is meant to run twice, one to get
a list of pages that have changed, and another one to download the changed
pages (and all their dependencies) again.
8) To split the generated freecad.qch into parts that are smaller than 50Mb
(github limit): split -d --byte=49M localwiki/freecad.qch localwiki/freecad.qch.part
9) To join the parts again (for testing): cat localwiki/freecad.qch.part* >> test.qch
Then check that test.qch has the same md5 checksum as localwiki/freecad.qch
10) To test: assistant -collectionFile localwiki/freecad.qhc

View File

@@ -1,322 +0,0 @@
#!/usr/bin/env python
#***************************************************************************
#* *
#* Copyright (c) 2009 Yorik van Havre <yorik@uncreated.net> *
#* *
#* This program is free software; you can redistribute it and/or modify *
#* it under the terms of the GNU Lesser General Public License (LGPL) *
#* as published by the Free Software Foundation; either version 2 of *
#* the License, or (at your option) any later version. *
#* for detail see the LICENCE text file. *
#* *
#* This program is distributed in the hope that it will be useful, *
#* but WITHOUT ANY WARRANTY; without even the implied warranty of *
#* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
#* GNU Library General Public License for more details. *
#* *
#* You should have received a copy of the GNU Library General Public *
#* License along with this program; if not, write to the Free Software *
#* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 *
#* USA *
#* *
#***************************************************************************
__title__="buildpdf"
__author__ = "Yorik van Havre <yorik@uncreated.net>"
__url__ = "http://www.freecadweb.org"
"""
This script builds a pdf file from a local copy of the wiki
"""
# Workbenches whose pages are grouped into their own chapter by createTOC()
Workbenches=["Part","PartDesign","Sketcher","Constraints","Draft","Path","Fem","Arch","TechDraw","Raytracing","OpenSCAD","Robot","Mesh"]
# Master table of contents: one wiki page name per line.
# "WorkbenchesList" is a placeholder expanded by createTOC();
# "begin"/"end" open/close one bookmark nesting level in joinpdf().
TOC="""Online_Help_Startpage
About_FreeCAD
Feature_list
Installing
Getting_started
Mouse_Model
Document_structure
Property_editor
Import_Export
Workbenches
WorkbenchesList
Interface_Customization
Preferences_Editor
Macros
Introduction_to_Python
Python_scripting_tutorial
Topological_data_scripting
Mesh_Scripting
Mesh_to_Part
Scenegraph
Pivy
begin
PySide
PySide_Beginner_Examples
PySide_Medium_Examples
PySide_Advanced_Examples
end
Scripted_objects
Embedding_FreeCAD
Embedding_FreeCADGui
Code_snippets"""

import sys, os, shutil, time
from urllib.request import urlopen

# CONFIGURATION #################################################
INDEX = "Online_Help_Toc" # the start page from where to crawl the wiki
PDFCONVERTOR = 'wkhtmltopdf' # can be 'pisa', 'htmldoc', 'wkhtmltopdf' or 'firefox'
VERBOSE = True # set true to get output messages
INCLUDECOMMANDS = True # if true, the command pages of each workbench are included after each WB page
OVERWRITE = False # if true, pdf files are recreated even if already existing
FIREFOXPDFFOLDER = os.path.expanduser("~")+os.sep+"PDF" # if firefox is used, set this to where it places its pdf files by default
COVER = "http://www.freecadweb.org/wiki/images/7/79/Freecad-pdf-cover.svg" # cover page artwork, fetched by createCover()
# END CONFIGURATION ##############################################

FOLDER = "./localwiki"  # where the downloaded wiki pages live
fcount = dcount = 0  # NOTE(review): these counters appear unused in this script
def crawl():
    """Entry point: check converter availability, then build and join the pdfs.

    Returns 0 on success, 1 when a required converter or library is missing.
    """
    # tests ###############################################
    if PDFCONVERTOR == 'pisa':
        try:
            import ho.pisa as pisa
        except Exception:
            # BUGFIX: this message was a bare string expression (a no-op),
            # so the failure was silent; actually print it
            print("Error: Python-pisa not installed, exiting.")
            return 1
    elif PDFCONVERTOR == 'htmldoc':
        if os.system('htmldoc --version'):
            print("Error: Htmldoc not found, exiting.")
            return 1
    try:
        from PyPDF2 import PdfFileReader,PdfFileWriter
    except Exception:
        print("Error: Python-pypdf2 not installed, exiting.")
        # BUGFIX: joinpdf() cannot work without PyPDF2, so bail out here
        # instead of falling through to the conversion step
        return 1
    # run ########################################################
    buildpdffiles()
    joinpdf()
    if VERBOSE: print("All done!")
    return 0
def buildpdffiles():
    """Scan FOLDER for .html files and convert each one to a pdf.

    Populates the module-level ``fileslist`` with the html file names found.
    """
    global fileslist
    if PDFCONVERTOR == 'wkhtmltopdf':
        makeStyleSheet()
    fileslist = [name for name in os.listdir(FOLDER) if name.endswith('.html')]
    print("converting ",len(fileslist)," pages")
    # map the configured converter to its implementation (htmldoc is the fallback)
    converters = {
        'pisa': createpdf_pisa,
        'wkhtmltopdf': createpdf_wkhtmltopdf,
        'firefox': createpdf_firefox,
    }
    convert = converters.get(PDFCONVERTOR, createpdf_htmldoc)
    for position, name in enumerate(fileslist, start=1):
        print(position," : ",name)
        convert(name[:-5])  # strip the ".html" suffix
def fetch_resources(uri, rel):
    """
    Callback allowing pisa/reportlab to retrieve images, stylesheets, etc.
    'uri' is the href attribute from the html link element.
    'rel' gives a relative path, but it's not used here.
    Note from Yorik: Not working!!
    """
    return os.path.join(FOLDER, uri.replace("./", ""))
def createpdf_pisa(pagename):
    """Convert a saved html page to pdf with pisa (python module).

    Returns 0 on success (or when the pdf already exists), or pisa's
    error count on failure.
    """
    import ho.pisa as pisa
    if (not exists(pagename+".pdf",image=True)) or OVERWRITE:
        # BUGFIX: mode 'ro' is not a valid open() mode in Python 3
        # (this file uses urllib.request, i.e. Python 3); read binary
        infile = open(FOLDER + os.sep + pagename+'.html','rb')
        outfile = open(FOLDER + os.sep + pagename+'.pdf','wb')
        if VERBOSE: print("Converting " + pagename + " to pdf...")
        pdf = pisa.CreatePDF(infile,outfile,FOLDER,link_callback=fetch_resources)
        # BUGFIX: the input file handle was never closed (resource leak)
        infile.close()
        outfile.close()
        if pdf.err:
            return pdf.err
    return 0
def createpdf_firefox(pagename):
    """Convert a saved html page to pdf by printing it through Firefox.

    Needs the command-line-printing extension and the default printer set to
    pdf output; the result is moved from FIREFOXPDFFOLDER into FOLDER.
    """
    # command line printing extension http://forums.mozillazine.org/viewtopic.php?f=38&t=2729795
    if exists(pagename + ".pdf", image=True) and not OVERWRITE:
        return
    source = FOLDER + os.sep + pagename + '.html'
    target = FOLDER + os.sep + pagename + '.pdf'
    os.system('firefox -print ' + source)
    time.sleep(6)  # give firefox some time to write the file
    produced = FIREFOXPDFFOLDER + os.sep + pagename + ".pdf"
    if os.path.exists(produced):
        shutil.move(produced, target)
    else:
        print("-----------------------------------------> Couldn't find print output!")
def createpdf_htmldoc(pagename):
    "creates a pdf file from a saved page using htmldoc (external app, but supports images)"
    # skip pages already converted unless OVERWRITE is requested
    if exists(pagename + ".pdf", image=True) and not OVERWRITE:
        return
    page_html = FOLDER + os.sep + pagename + '.html'
    page_pdf = FOLDER + os.sep + pagename + '.pdf'
    return os.system('htmldoc --webpage --textfont sans --browserwidth 840 -f ' + page_pdf + ' ' + page_html)
def createpdf_wkhtmltopdf(pagename):
    "creates a pdf file from a saved page using wkhtmltopdf (external app, but supports images)"
    if (not exists(pagename+".pdf",image=True)) or OVERWRITE:
        infile = FOLDER + os.sep + pagename+'.html'
        outfile = FOLDER + os.sep + pagename+'.pdf'
        cmd = 'wkhtmltopdf -L 5mm --user-style-sheet '+FOLDER+os.sep+'wkhtmltopdf.css '+infile+' '+outfile
        print(cmd)
        # NOTE(review): the actual conversion is disabled -- the command is
        # only printed, never executed.  Looks unintentional; confirm before
        # re-enabling the line below.
        #return os.system(cmd)
    else:
        print("skipping")
def joinpdf():
    """Concatenate the per-page pdfs into FOLDER/freecad.pdf.

    Follows the page order returned by createTOC(), prepends the cover page,
    and adds one pdf bookmark per page.  TOC markers "begin"/"end" open and
    close one nesting level of bookmarks; lines starting with "#" are
    comments.  Aborts the whole program if a listed pdf is missing.
    """
    from PyPDF2 import PdfFileReader,PdfFileWriter
    if VERBOSE: print("Building table of contents...")
    result = PdfFileWriter()
    createCover()
    inputfile = PdfFileReader(open(FOLDER+os.sep+'Cover.pdf','rb'))
    result.addPage(inputfile.getPage(0))
    count = 1  # running page number, used as the bookmark target
    tocfile = createTOC()
    # parent is a 3-state flag: False = top-level bookmarks, True = the next
    # bookmark becomes a parent, otherwise it holds the parent bookmark object
    parent = False
    for page in tocfile:
        page = page.strip()
        if page:
            if page[0] == "#":  # comment line in the TOC
                continue
            if page == "begin":
                parent = True
                continue
            if page == "end":
                parent = False
                continue
            if VERBOSE: print('Appending ',page, " at position ",count)
            title = page.replace("_"," ")
            pdffile = page + ".pdf"
            if exists(pdffile,True):
                inputfile = PdfFileReader(open(FOLDER + os.sep + pdffile,'rb'))
                numpages = inputfile.getNumPages()
                for i in range(numpages):
                    result.addPage(inputfile.getPage(i))
                if parent == True:
                    # keep the returned bookmark so children can attach to it
                    parent = result.addBookmark(title,count)
                elif parent == False:
                    result.addBookmark(title,count)
                else:
                    result.addBookmark(title,count,parent)
                count += numpages
            else:
                print("page ",pdffile," not found, aborting.")
                sys.exit()
    if VERBOSE: print("Writing...")
    outputfile = open(FOLDER+os.sep+"freecad.pdf",'wb')
    result.write(outputfile)
    outputfile.close()
    if VERBOSE:
        print(' ')
        print('Successfully created '+FOLDER+os.sep+'freecad.pdf')
def createTOC():
    """Expand the TOC template into the final ordered list of page names.

    Replaces the "WorkbenchesList" placeholder with, for each workbench:
    a "begin" marker, the workbench main page, its command pages, its
    tutorial (when present) and an "end" marker, so joinpdf() can nest
    the bookmarks.  Returns the list of page names (without extension).
    """
    tocfile = TOC.split("\n")
    # BUGFIX: listdir() entries carry the ".pdf" extension while TOC entries
    # are bare page names, so every membership/endswith test below failed;
    # strip the extension up front
    files = [f[:-4] for f in os.listdir(FOLDER) if f.endswith(".pdf")]
    wbpages = []
    for wb in Workbenches:
        # BUGFIX: was "wbpage += 'begin'" -- NameError on an undefined variable
        wbpages.append("begin")
        if wb+"_Workbench" in files:
            wbpages.append(wb+"_Workbench")
        for f in files:
            if f.lower().startswith(wb.lower()+"_"):
                if (not f.lower().endswith("_workbench")) and (not f.lower().endswith("tutorial")):
                    # BUGFIX: was "wb.append(f)" -- strings have no append()
                    wbpages.append(f)
        if wb+"_tutorial" in files:
            wbpages.append(wb+"_tutorial")
        wbpages.append("end")
    toc = []
    for i in tocfile:
        if i == "WorkbenchesList":
            toc.extend(wbpages)
        else:
            toc.append(i)
    return toc
def local(page,image=False):
    """Return the local path of a page (".html" is appended) or an image."""
    suffix = '' if image else '.html'
    return FOLDER + os.sep + page + suffix
def exists(page,image=False):
    """Return True when the given page/image already exists locally."""
    return os.path.exists(local(page,image))
def makeStyleSheet():
    """Write FOLDER/wkhtmltopdf.css, which hides the wiki print footer."""
    # BUGFIX: the file was opened in binary mode ('wb') while a str was
    # written, which raises TypeError under Python 3 (this script uses
    # urllib.request, i.e. Python 3); open in text mode and use a context
    # manager so the handle is always closed
    with open(FOLDER + os.sep + "wkhtmltopdf.css", 'w') as outputfile:
        outputfile.write("""
.printfooter {
display:none !important;
}
""")
def createCover():
    """Download the cover artwork and convert it to Cover.pdf via inkscape."""
    if VERBOSE: print("fetching " + COVER)
    svgpath = FOLDER + os.sep + "Cover.svg"
    with open(svgpath, 'wb') as svgfile:
        svgfile.write(urlopen(COVER).read())
    pdfpath = FOLDER + os.sep + 'Cover.pdf'
    os.system('inkscape --export-pdf=' + pdfpath + ' ' + svgpath)
# Allow running the script directly from the command line.
if __name__ == "__main__":
    crawl()

View File

@@ -1,246 +0,0 @@
#!/usr/bin/env python3
#***************************************************************************
#* *
#* Copyright (c) 2009 Yorik van Havre <yorik@uncreated.net> *
#* *
#* This program is free software; you can redistribute it and/or modify *
#* it under the terms of the GNU Lesser General Public License (LGPL) *
#* as published by the Free Software Foundation; either version 2 of *
#* the License, or (at your option) any later version. *
#* for detail see the LICENCE text file. *
#* *
#* This program is distributed in the hope that it will be useful, *
#* but WITHOUT ANY WARRANTY; without even the implied warranty of *
#* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
#* GNU Library General Public License for more details. *
#* *
#* You should have received a copy of the GNU Library General Public *
#* License along with this program; if not, write to the Free Software *
#* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 *
#* USA *
#* *
#***************************************************************************
__title__="wiki2qhelp"
__author__ = "Yorik van Havre <yorik@uncreated.net>"
__url__ = "http://www.freecadweb.org"
"""
This script builds qhrlp files from a local copy of the wiki
"""
import os, re, shutil
# CONFIGURATION #################################################
FOLDER = "./localwiki"
INDEX = "Online_Help_Toc" # the start page from where to crawl the wiki
VERBOSE = True # to display what's going on. Otherwise, runs totally silent.
QHELPCOMPILER = 'qhelpgenerator'
RELEASE = '0.19'
# END CONFIGURATION ##############################################
fcount = dcount = 0
def crawl():
    """Build freecad.qch and freecad.qhc from the local wiki copy.

    Returns 0 on success, 1 when the Qt help compiler is missing or a
    generation/compilation step fails.
    """
    # tests ###############################################
    if os.system(QHELPCOMPILER +' -v'):
        print ("Error: QAssistant not fully installed, exiting.")
        return 1
    # run ########################################################
    qhp = buildtoc()
    qhcp = createCollProjectFile()
    # the collection project references this icon, so put it next to it
    shutil.copy("../../Gui/Icons/freecad-icon-64.png","localwiki/freecad-icon-64.png")
    if generate(qhcp):
        print ("Error while generating")
        return 1
    if compile(qhp):
        print ("Error while compiling")
        return 1
    if VERBOSE: print ("All done!")
    # NOTE(review): interactive copy-to-source-tree step disabled on purpose
    #i=raw_input("Copy the files to their correct location in the source tree? y/n (default=no) ")
    #if i.upper() in ["Y","YES"]:
    #    shutil.copy("localwiki/freecad.qch","../../Doc/freecad.qch")
    #    shutil.copy("localwiki/freecad.qhc","../../Doc/freecad.qhc")
    #else:
    print ('Files freecad.qch and freecad.qhc are in localwiki. Test with "assistant -collectionFile localwiki/freecad.qhc"')
    return 0
def compile(qhpfile):
    """Compile the qhp project into FOLDER/freecad.qch with the Qt help compiler.

    Returns 0 on success, 1 on failure.  NOTE: shadows the builtin compile(),
    kept for compatibility with existing callers.
    """
    qchfile = FOLDER + os.sep + "freecad.qch"
    failed = os.system(QHELPCOMPILER + ' ' + qhpfile + ' -o ' + qchfile)
    if failed:
        return 1
    if VERBOSE: print ("Successfully created",qchfile)
    return 0
def generate(qhcpfile):
    """Write the "about" text and compile the qhcp collection into freecad.qhc.

    Returns 0 on success, 1 on failure.
    """
    txt="""
<center>FreeCAD """+RELEASE+""" help files<br/>
<a href="http://www.freecadweb.org">http://www.freecadweb.org</a></center>
"""
    with open(FOLDER + os.sep + "about.txt", "w") as about:
        about.write(txt)
    qhcfile = FOLDER + os.sep + "freecad.qhc"
    if os.system(QHELPCOMPILER + ' ' + qhcpfile + ' -o ' + qhcfile):
        return 1
    if VERBOSE: print ("Successfully created ",qhcfile)
    return 0
def createCollProjectFile():
    """Write FOLDER/freecad.qhcp, the Qt Assistant collection project file,
    and return its path."""
    qprojectfile = '''<?xml version="1.0" encoding="UTF-8"?>
<QHelpCollectionProject version="1.0">
<assistant>
<title>FreeCAD User Manual</title>
<applicationIcon>freecad-icon-64.png</applicationIcon>
<cacheDirectory base="collection">freecad/freecad</cacheDirectory>
<startPage>qthelp://org.freecad.usermanual/doc/Online_Help_Startpage.html</startPage>
<aboutMenuText>
<text>About FreeCAD</text>
</aboutMenuText>
<aboutDialog>
<file>about.txt</file>
<icon>freecad-icon-64.png</icon>
</aboutDialog>
<enableDocumentationManager>true</enableDocumentationManager>
<enableAddressBar>true</enableAddressBar>
<enableFilterFunctionality>true</enableFilterFunctionality>
</assistant>
<docFiles>
<generate>
<file>
<input>freecad.qhp</input>
<output>freecad.qch</output>
</file>
</generate>
<register>
<file>freecad.qch</file>
</register>
</docFiles>
</QHelpCollectionProject>
'''
    if VERBOSE: print ("Building project file...")
    qfilename = FOLDER + os.sep + "freecad.qhcp"
    with open(qfilename, 'w') as projectfile:
        projectfile.write(qprojectfile)
    if VERBOSE: print ("Done writing qhcp file:",qfilename)
    return qfilename
def buildtoc():
    '''
    Parse the downloaded table-of-contents page (FOLDER/INDEX.html) and
    write FOLDER/freecad.qhp, the Qt help project file, filling in the
    <toc>, <keywords> and <files> sections.  Returns the qhp file path.
    '''
    # qhp template; <inserttoc>, <insertkeywords> and <insertfiles> are
    # placeholders substituted below
    qhelpfile = '''<?xml version="1.0" encoding="UTF-8"?>
<QtHelpProject version="1.0">
<namespace>org.freecad.usermanual</namespace>
<virtualFolder>doc</virtualFolder>
<!--
<customFilter name="FreeCAD '''+RELEASE+'''">
<filterAttribute>FreeCAD</filterAttribute>
<filterAttribute>'''+RELEASE+'''</filterAttribute>
</customFilter>
-->
<filterSection>
<!--
<filterAttribute>FreeCAD</filterAttribute>
<filterAttribute>'''+RELEASE+'''</filterAttribute>
-->
<toc>
<inserttoc>
</toc>
<keywords>
<insertkeywords>
</keywords>
<insertfiles>
</filterSection>
</QtHelpProject>
'''
    def getname(line):
        # Extract a (title, link) pair from one <li> entry of the TOC page.
        line = re.compile('<li>').sub('',line)
        line = re.compile('</li>').sub('',line)
        title = line.strip()
        link = ''
        if "<a" in line:
            title = re.findall('<a[^>]*>(.*?)</a>',line)[0].strip()
            link = re.findall('href="(.*?)"',line)[0].strip()
        if link:
            if not link.endswith(".html"):
                link = link + ".html"
            if link.startswith("/"):
                link = link[1:]
        if not link: link = 'default.html'
        if title.startswith("<img"):
            # workbenches: the entry is an icon image, derive name from the svg
            wb = re.findall("Workbench\_(.*?)\.svg",title)[0]
            title = wb + " Workbench"
            link = wb + "_Workbench.html"
        return title,link
    if VERBOSE: print ("Building table of contents...")
    f = open(FOLDER+os.sep+INDEX+'.html')
    html = ''
    for line in f: html += line
    f.close()
    # flatten the html so the regexes below can work line-independently
    html = html.replace("\n"," ")
    html = html.replace("> <","><")
    html = re.findall("<ul.*/ul>",html)[0]
    items = re.findall('<li[^>]*>.*?</li>|</ul></li>',html)
    inserttoc = '<section title="FreeCAD Documentation" ref="Online_Help_Toc.html">\n'
    insertkeywords = ''
    for item in items:
        if not ("<ul>" in item):
            if ("</ul>" in item):
                # end of a sub-list: close the current <section>
                inserttoc += ' </section>\n'
            else:
                link = ''
                title,link=getname(item)
                if link:
                    link='" ref="'+link
                insertkeywords += ('<keyword name="'+title+link+'"/>\n')
                if link and title:
                    inserttoc += (' <section title="'+title+link+'"></section>\n')
        else:
            # entry opens one or more nested sub-lists
            subitems = item.split("<ul>")
            for i in range(len(subitems)):
                link = ''
                title,link=getname(subitems[i])
                if link:
                    link='" ref="'+link
                insertkeywords += ('<keyword name="'+title+link+'"/>\n')
                trail = ''
                if i == len(subitems)-1: trail = '</section>'
                if link and title:
                    inserttoc += (' <section title="'+title+link+'">'+trail+'\n')
    inserttoc += '</section>\n'
    # every file in FOLDER gets registered in the help project
    insertfiles = "<files>\n"
    for fil in os.listdir(FOLDER):
        insertfiles += ("<file>"+fil+"</file>\n")
    insertfiles += "</files>\n"
    qhelpfile = re.compile('<insertkeywords>').sub(insertkeywords,qhelpfile)
    qhelpfile = re.compile('<inserttoc>').sub(inserttoc,qhelpfile)
    qhelpfile = re.compile('<insertfiles>').sub(insertfiles,qhelpfile)
    qfilename = FOLDER + os.sep + "freecad.qhp"
    f = open(qfilename,'w')
    f.write(qhelpfile)
    f.close()
    if VERBOSE: print ("Done writing qhp file:",qfilename)
    return qfilename
# Allow running the script directly from the command line.
if __name__ == "__main__":
    crawl()

View File

@@ -1,222 +0,0 @@
#!/usr/bin/env python
#***************************************************************************
#* *
#* Copyright (c) 2009 Yorik van Havre <yorik@uncreated.net> *
#* *
#* This program is free software; you can redistribute it and/or modify *
#* it under the terms of the GNU Lesser General Public License (LGPL) *
#* as published by the Free Software Foundation; either version 2 of *
#* the License, or (at your option) any later version. *
#* for detail see the LICENCE text file. *
#* *
#* This program is distributed in the hope that it will be useful, *
#* but WITHOUT ANY WARRANTY; without even the implied warranty of *
#* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
#* GNU Library General Public License for more details. *
#* *
#* You should have received a copy of the GNU Library General Public *
#* License along with this program; if not, write to the Free Software *
#* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 *
#* USA *
#* *
#***************************************************************************
from __future__ import print_function
__title__="buildwikiindex.py"
__author__ = "Yorik van Havre <yorik@uncreated.net>"
__url__ = "http://www.freecadweb.org"
"""
This script parses the contents of a wiki site and saves a file containing
names of pages and images to be downloaded.
"""
# NOTE(review): this script is Python 2 (urllib2 import below)
import sys, os, re
from urllib2 import urlopen, HTTPError

# CONFIGURATION #################################################
URL = "https://www.freecadweb.org/wiki" #default URL if no URL is passed
INDEX = "Online_Help_Toc" # the start page from where to crawl the wiki
NORETRIEVE = ['Manual','Developer_hub','Power_users_hub','Users_hub','Source_documentation',
'User_hub','Main_Page','About_this_site','Interesting_links','Syndication_feeds',
'FreeCAD:General_disclaimer','FreeCAD:About','FreeCAD:Privacy_policy','WikiPages'] # pages that won't be fetched (kept online)
NORETRIEVE += ['Constraint_Concentric','Constraint_EqualLength','Constraint_ExternalAngle',
'Constraint_Horizontal','Constraint_HorizontalDistance','Constraint_Internal_Alignment',
'Constraint_InternalAngle','Constraint_Length','Constraint_Lock','Constraint_Parallel',
'Constraint_Perpendicular','Constraint_PointOnEnd','Constraint_PointOnMidPoint',
'Constraint_PointOnObject','Constraint_PointOnPoint','Constraint_PointOnStart',
'Constraint_PointToObject','Constraint_Radius','Constraint_SnellsLaw',
'Constraint_Symmetric','Constraint_Tangent','Constraint_TangentToEnd',
'Constraint_TangentToStart','Constraint_Vertical',
'Join_Cutout','Join_Embed','Part_BooleanFragment','Part_Sections','Curves_HelicalSweep',
'CurvedShapes_FlyingWingS800','CurvedShapes_HortenHIX','CurvedShapes_SurfaceCut',
'CurvedShapes_InterpolatedMiddle','CurvedShapes_CurvedSegment','Arch_Cell',
'Std_ClippingPlane','Std_AboutQt'] # pages that have been renamed but still dangle around...
GETTRANSLATIONS = False # Set true if you want to get the translations too.
MAXFAIL = 3 # max number of retries if download fails
VERBOSE = True # to display what's going on. Otherwise, runs totally silent.
WRITETHROUGH = True # if true, fetched files are constantly written to disk, in case of failure.
# END CONFIGURATION ##############################################

wikiindex = "/index.php?title="  # url fragment prepended to page names
def crawl(pagename=[]):
    """Build the list of wiki files to download.

    With page names given, scans only those pages and returns the list of
    processed pages/images.  Without arguments, resumes from an existing
    wikifiles.txt / todolist.txt when present (otherwise starts from INDEX),
    writes the collected names to wikifiles.txt and returns 0.

    NOTE(review): the mutable default argument is shared across calls but is
    never mutated here, so it is harmless.
    """
    todolist = []
    processed = []
    count = 1
    if pagename:
        if not isinstance(pagename,list):
            pagename = [pagename]
        todolist = pagename
    else:
        # resume support: previously fetched pages are skipped...
        if os.path.exists("wikifiles.txt"):
            f = open("wikifiles.txt","r")
            if VERBOSE: print ("Reading existing list...")
            for l in f.readlines():
                if l.strip() != "":
                    if VERBOSE: print ("Adding ",l)
                    processed.append(l.strip())
            f.close()
        # ...and a pending todo list is picked up where it left off
        if os.path.exists("todolist.txt"):
            f = open("todolist.txt","r")
            if VERBOSE: print ("Reading existing todo list...")
            for l in f.readlines():
                if l.strip() != "":
                    todolist.append(l.strip())
            f.close()
        else:
            indexpages,imgs = get(INDEX)
            todolist.extend(indexpages)
    while todolist:
        targetpage = todolist.pop()
        if (not targetpage in NORETRIEVE):
            if VERBOSE: print (count, ": Scanning ", targetpage)
            pages,images = get(targetpage)
            count += 1
            processed.append(targetpage)
            processed.extend(images)
            if VERBOSE: print ("got",len(pages),"links")
            for p in pages:
                if (not (p in todolist)) and (not (p in processed)):
                    todolist.append(p)
            if WRITETHROUGH:
                # persist progress so an interrupted run can resume later
                writeList(processed)
                writeList(todolist,"todolist.txt")
    if VERBOSE: print ("Fetched ", count, " pages")
    if not WRITETHROUGH:
        writeList(processed)
    if pagename:
        return processed
    return 0
def get(page):
    """Fetch and clean one wiki page; return (linked pages, linked images)."""
    cleaned = cleanhtml(fetchpage(page))
    return getlinks(cleaned), getimagelinks(cleaned)
def cleanhtml(html):
    """Strip navigation, scripts and other wiki chrome from an html page."""
    # fold linebreaks away so the regexes below can span "lines"
    html = html.replace('\n','Wlinebreak')
    # ordered (pattern, replacement) pairs, applied exactly as before
    substitutions = [
        ('(.*)<div[^>]+column-content+[^>]+>', ''),           # strip everything before the content
        ('<div[^>]+column-one+[^>]+>.*', ''),                 # strip everything after the content
        ('<!--[^>]+-->', ''),                                 # comment tags
        ('<script[^>]*>.*?</script>', ''),                    # script tags
        ('<!--\[if[^>]*>.*?endif\]-->', ''),                  # IE conditional tags
        ('<div id="jump-to-nav"[^>]*>.*?</div>', ''),         # nav div
        ('<h3 id="siteSub"[^>]*>.*?</h3>', ''),               # print subtitle
        ('Retrieved from', 'Online version:'),                # change online title
        ('<div id="mw-normal-catlinks[^>]>.*?</div>', ''),    # catlinks
        ('<div class="NavHead.*?</div>', ''),                 # nav stuff
        ('<div class="NavContent.*?</div>', ''),              # nav stuff
        ('<div class="NavEnd.*?</div>', ''),                  # nav stuff
        ('<div class="mw-pt-translate-header.*?</div>', ''),  # translation header
    ]
    if not GETTRANSLATIONS:
        substitutions += [
            ('<div class="languages.*?</div>', ''),           # translation links
            ('<div class="mw-pt-languages.*?</div>', ''),     # translation links
        ]
    for pattern, replacement in substitutions:
        html = re.compile(pattern).sub(replacement, html)
    # restore the original linebreaks
    return re.compile('Wlinebreak').sub('\n', html)
def getlinks(html):
    """Return the wiki page names linked from an html file.

    Side effect: grows the global NORETRIEVE blacklist with links that must
    stay online (namespaced, query-string or translated pages).
    """
    global NORETRIEVE
    links = re.findall('<a[^>]*>.*?</a>',html)
    pages = []
    for l in links:
        # try the different url shapes used by the wiki, most specific first
        rg = re.findall('href=.*?php\?title=(.*?)"',l)
        if not rg:
            rg = re.findall('href="\/wiki\/(.*?)"',l)
        if not rg:
            rg = re.findall('href=".*?wiki\\.freecadweb\\.org\/(.*?)"',l)
        if not rg:
            rg = re.findall('href="\/(.*?)"',l)
        # BUGFIX: the old code tested '"images" in rg' / '"mediawiki" in rg',
        # i.e. exact membership in the findall result LIST, which never
        # matched paths like "images/a/ab/x.png"; inspect the matched string
        # itself to drop links to site resources
        if rg and ("images" in rg[0] or "mediawiki" in rg[0]):
            rg = None
        if rg:
            rg = rg[0]
            if not "Command_Reference" in rg:
                if "#" in rg:
                    rg = rg.split('#')[0]  # drop anchors
                if ":" in rg:
                    NORETRIEVE.append(rg)  # namespaced page: keep online
                if "&" in rg:
                    NORETRIEVE.append(rg)  # query string: keep online
                if ";" in rg:
                    NORETRIEVE.append(rg)
                if "/" in rg:
                    if not GETTRANSLATIONS:
                        NORETRIEVE.append(rg)  # translated page
                if not rg in NORETRIEVE:
                    pages.append(rg)
                    print ("got link: ",rg)
    return pages
def getimagelinks(html):
    """Return local image links found in an html file (external ones are dropped)."""
    return [src for src in re.findall('<img.*?src="(.*?)"', html)
            if not src.startswith("http")]
def fetchpage(page):
    """Fetch a wiki page, retrying up to MAXFAIL times, then abort the program."""
    print ("fetching: ",page)
    for _attempt in range(MAXFAIL):
        try:
            return urlopen(URL + wikiindex + page).read()
        except HTTPError:
            pass  # retry
    print ('Error: unable to fetch page ' + page)
    sys.exit()
def cleanList(pagelist):
    """Return pagelist without duplicates (first occurrence kept) and
    without "redlink" (missing page) urls."""
    result = []
    for entry in pagelist:
        if entry not in result and "redlink" not in entry:
            result.append(entry)
    return result
def writeList(pages,filename="wikifiles.txt"):
    # Persist a cleaned page list, one name per line.
    # (Python 2 script: writing str to a 'wb' file is fine here.)
    pages = cleanList(pages)
    f = open(filename,"wb")
    for p in pages:
        f.write(p+"\n")
    f.close()
    if VERBOSE: print ("written ",filename)
# Page names given on the command line restrict the crawl to those pages.
if __name__ == "__main__":
    crawl(sys.argv[1:])

View File

@@ -1,355 +0,0 @@
#!/usr/bin/env python
#***************************************************************************
#* *
#* Copyright (c) 2009 Yorik van Havre <yorik@uncreated.net> *
#* *
#* This program is free software; you can redistribute it and/or modify *
#* it under the terms of the GNU Lesser General Public License (LGPL) *
#* as published by the Free Software Foundation; either version 2 of *
#* the License, or (at your option) any later version. *
#* for detail see the LICENCE text file. *
#* *
#* This program is distributed in the hope that it will be useful, *
#* but WITHOUT ANY WARRANTY; without even the implied warranty of *
#* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
#* GNU Library General Public License for more details. *
#* *
#* You should have received a copy of the GNU Library General Public *
#* License along with this program; if not, write to the Free Software *
#* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 *
#* USA *
#* *
#***************************************************************************
from __future__ import print_function
__title__="downloadwiki"
__author__ = "Yorik van Havre <yorik@uncreated.net>"
__url__ = "http://www.freecadweb.org"
"""
This script retrieves the contents of a wiki site from a pages list
"""
# NOTE(review): this script is Python 2 (urllib2 import below)
import os, re
from urllib2 import urlopen, HTTPError

# CONFIGURATION #################################################
DEFAULTURL = "https://www.freecadweb.org" #default URL if no URL is passed
INDEX = "Online_Help_Toc" # the start page from where to crawl the wiki
NORETRIEVE = ['Manual','Developer_hub','Power_users_hub','Users_hub','Source_documentation',
'User_hub','Main_Page','About_this_site','Interesting_links','Syndication_feeds',
'FreeCAD:General_disclaimer','FreeCAD:About','FreeCAD:Privacy_policy','WikiPages'] # pages that won't be fetched (kept online)
NORETRIEVE += ['Constraint_Concentric','Constraint_EqualLength','Constraint_ExternalAngle',
'Constraint_Horizontal','Constraint_HorizontalDistance','Constraint_Internal_Alignment',
'Constraint_InternalAngle','Constraint_Length','Constraint_Lock','Constraint_Parallel',
'Constraint_Perpendicular','Constraint_PointOnEnd','Constraint_PointOnMidPoint',
'Constraint_PointOnObject','Constraint_PointOnPoint','Constraint_PointOnStart',
'Constraint_PointToObject','Constraint_Radius','Constraint_SnellsLaw',
'Constraint_Symmetric','Constraint_Tangent','Constraint_TangentToEnd',
'Constraint_TangentToStart','Constraint_Vertical'] # pages that have been renamed but still dangle around
MAXFAIL = 3 # max number of retries if download fails
VERBOSE = True # to display what's going on. Otherwise, runs totally silent.
GETTRANSLATIONS = False # Set true if you want to get the translations too.
# END CONFIGURATION ##############################################
FOLDER = "./localwiki"
LISTFILE = "wikifiles.txt"
URL = DEFAULTURL
wikiindex = "/wiki/index.php?title="
imageprefix = "/wiki/"
defaultfile = "<html><head><link type='text/css' href='wiki.css' rel='stylesheet'></head><body>&nbsp;</body></html>"
css = """/* Basic CSS for offline wiki rendering */
body {
font-family: Fira Sans,Arial,Helvetica,sans-serif;
text-align: justify;
max-width: 800px;
}
h1 {
font-size: 2.4em;
font-weight: bold;
padding: 5px;
border-radius: 5px;
}
h2 {
font-weight: normal;
font-size: 1.6em;
border-bottom: 1px solid #ddd;
}
h3 {
padding-left: 20px;
}
img {
max-width: 100%;
}
li {
margin-top: 10px;
}
pre, .mw-code {
text-align: left;
padding: 5px 5px 5px 20px;
font-family: mono;
border-radius: 2px;
}
a:link, a:visited {
font-weight: bold;
text-decoration: none;
color: #2969C4;
}
a:hover {
text-decoration: underline;
}
.printfooter {
font-size: 0.8em;
color: #333333;
border-top: 1px solid #333;
margin-top: 20px;
}
.wikitable #toc {
font-size: 0.8em;
}
.ct, .ctTitle, .ctOdd, .ctEven th {
font-size: 1em;
text-align: left;
width: 190px;
float: right;
margin-top: 10px;
border-radius: 2px;
}
.ct {
margin-left: 15px;
padding: 10px;
}
#mw-navigation, .mw-jump-link, .docnav, .NavFrame {
display:none; /*TODO remove on next build (included below)*/
}
"""
def crawl():
    """Download every page listed in LISTFILE into FOLDER, plus INDEX.

    Also writes the offline stylesheet (wiki.css) and a default placeholder
    page into FOLDER.  Returns 0.
    """
    global processed
    processed = []
    if VERBOSE: print ("crawling ", URL, ", saving in ", FOLDER)
    if not os.path.isdir(FOLDER): os.mkdir(FOLDER)
    # Python 2 script: writing str to a 'wb' file is fine here
    file = open(FOLDER + os.sep + "wiki.css",'wb')
    file.write(css)
    file.close()
    dfile = open(FOLDER + os.sep + "default.html",'wb')
    dfile.write(defaultfile)
    dfile.close()
    # read the page list produced by buildwikiindex.py
    lfile = open(LISTFILE)
    global locallist
    locallist = []
    for l in lfile: locallist.append(l.replace("\n",""))
    lfile.close()
    todolist = locallist[:]
    print ("getting ",len(todolist)," files...")
    count = 1
    get(INDEX)
    while todolist:
        targetpage = todolist.pop()
        if VERBOSE: print (count,(3-len(str(count)))*" ", ": Fetching ", targetpage)
        get(targetpage)
        count += 1
    # NOTE(review): count ends one past the number of list pages fetched
    if VERBOSE: print ("Fetched ", count, " pages")
    if VERBOSE: print ("All done!")
    return 0
def get(page):
    """Download a single page or image, unless a local copy already exists."""
    localpage = page
    if "Command_Reference" in localpage:
        # flatten category/paging urls into plain local filenames
        localpage = localpage.replace("Category:","")
        localpage = localpage.replace("&pagefrom=","+")
        localpage = localpage.replace("#mw-pages","")
    if page[-4:] in [".png",".jpg",".svg",".gif","jpeg",".PNG",".JPG"]:
        fetchimage(page)
    elif not exists(localpage):
        # fetch, strip wiki chrome, rewrite links, then save locally
        html = fetchpage(page)
        html = cleanhtml(html)
        pages = getlinks(html)
        html = cleanlinks(html,pages)
        html = cleanimagelinks(html)
        output(html,page)
    else:
        if VERBOSE: print (" skipping ",page)
def getlinks(html):
    """returns a list of wikipage links in html file

    Side effect: page names containing ":", ";", "&" or (when translations
    are disabled) "/" are appended to the NORETRIEVE global list, so that
    cleanlinks() keeps them pointing at the online wiki instead of a local
    file. Anchors ("#...") are stripped, except on Command_Reference
    paging links which are kept verbatim."""
    links = re.findall('<a[^>]*>.*?</a>',html)
    pages = []
    for l in links:
        # rg = re.findall('php\?title=(.*)\" title',l)
        rg = re.findall('href=.*?php\?title=(.*?)"',l)
        if not rg:
            # also accept the short /wiki/Pagename link form
            rg = re.findall('href="\/wiki\/(.*?)"',l)
        if rg:
            rg = rg[0]
            if not "Command_Reference" in rg:
                if "#" in rg:
                    rg = rg.split('#')[0]
                if ":" in rg:
                    NORETRIEVE.append(rg)
                if ";" in rg:
                    NORETRIEVE.append(rg)
                if "&" in rg:
                    NORETRIEVE.append(rg)
                if "/" in rg:
                    if not GETTRANSLATIONS:
                        NORETRIEVE.append(rg)
            # note: names added to NORETRIEVE are still returned here
            pages.append(rg)
    return pages
def getimagelinks(html):
    """returns a list of image links found in an html file"""
    # capture the src attribute of every <img> tag (lazy match)
    img_src = re.compile('<img.*?src="(.*?)"')
    return img_src.findall(html)
def cleanhtml(html):
    """cleans given html code from dirty script stuff

    Strips the MediaWiki chrome (navigation, toc, scripts, comments,
    catlinks, ...) so only the page body remains. Linebreaks are
    temporarily replaced by a "Wlinebreak" token so the regexes can match
    across lines, then restored at the end. The order of substitutions
    matters: the first two cut everything before/after the content div."""
    html = html.replace('\n','Wlinebreak') # removing linebreaks for regex processing
    html = html.replace('\t','') # removing tab marks
    html = re.compile('(.*)<div id=\"content+[^>]+>').sub('',html) # stripping before content
    html = re.compile('<div id="mw-head+[^>]+>.*').sub('',html) # stripping after content
    html = re.compile('<!--[^>]+-->').sub('',html) # removing comment tags
    html = re.compile('<script[^>]*>.*?</script>').sub('',html) # removing script tags
    html = re.compile('<!--\[if[^>]*>.*?endif\]-->').sub('',html) # removing IE tags
    html = re.compile('<div id="jump-to-nav"[^>]*>.*?</div>').sub('',html) # removing nav div
    html = re.compile('<h3 id="siteSub"[^>]*>.*?</h3>').sub('',html) # removing print subtitle
    html = re.compile('Retrieved from').sub('Online version:',html) # changing online title
    html = re.compile('<div id="mw-normal-catlinks.*?</div>').sub('',html) # removing catlinks
    html = re.compile('<div class="NavHead.*?</div>').sub('',html) # removing nav stuff
    html = re.compile('<div class="NavContent.*?</div>').sub('',html) # removing nav stuff
    html = re.compile('<div class="NavEnd.*?</div>').sub('',html) # removing nav stuff
    html = re.compile('<div id="mw-navigation.*?</div>').sub('',html) # removing nav stuff
    html = re.compile('<table id="toc.*?</table>').sub('',html) # removing toc
    html = re.compile('width=\"100%\" style=\"float: right; width: 230px; margin-left: 1em\"').sub('',html) # removing command box styling
    #html = re.compile('<div class="docnav.*?</div>Wlinebreak</div>').sub('',html) # removing docnav
    html = re.compile('<div class="mw-pt-translate-header.*?</div>').sub('',html) # removing translations links
    if not GETTRANSLATIONS:
        html = re.compile('<div class="languages.*?</div>').sub('',html) # removing translations links
        html = re.compile('<div class="mw-pt-languages.*?</div>').sub('',html) # removing translations links
    html = re.compile('Wlinebreak').sub('\n',html) # restoring original linebreaks
    return html
def cleanlinks(html, pages=None):
    """cleans page links found in html

    Rewrites wiki hrefs to flat local "<page>.html" files; pages listed in
    the NORETRIEVE global keep pointing at the online wiki. Command_Reference
    paging links get extra fixups. NOTE(review): the local variable "output"
    shadows the module-level output() function — harmless here, but
    confusing."""
    if not pages: pages = getlinks(html)
    for page in pages:
        if page in NORETRIEVE:
            output = 'href="' + URL + wikiindex + page + '"'
        else:
            output = 'href="' + page.replace("/","-") + '.html"'
        html = re.compile('href="[^"]+' + page + '"').sub(output,html)
        if "Command_Reference" in output:
            html = html.replace("Category:","")
            html = html.replace("&amp;pagefrom=","+")
            html = html.replace("#mw-pages",".html")
            html = html.replace("/wiki/index.php?title=Command_Reference","Command_Reference")
    return html
def cleanimagelinks(html, links=None):
    """cleans image links in given html

    Strips any path component from each image link, so images point to
    the flat local folder. If no links are given, they are extracted
    from the html itself."""
    if not links:
        links = getimagelinks(html)
    for link in links or []:
        basename = re.findall('.*/(.*)', link)
        if basename:
            html = html.replace(link, basename[0])
    return html
def fetchpage(page):
    """retrieves given page from the wiki and returns its html as a str

    Retries up to MAXFAIL times on HTTP errors; returns None after
    exhausting the retries (original behavior, kept)."""
    print (" downloading: ",URL + wikiindex + page)
    failcount = 0
    while failcount < MAXFAIL:
        try:
            # decode: urlopen().read() returns bytes on python3, but the
            # callers (cleanhtml & co) run str regex processing on it
            html = urlopen(URL + wikiindex + page).read().decode("utf8")
            return html
        except HTTPError:
            failcount += 1
    print ('Error: unable to fetch page ' + page)
def fetchimage(imagelink):
    """retrieves given image from the wiki and saves it

    Skips "File:" description-page links and images already present
    locally. Retries up to MAXFAIL times; on success, appends the file
    name to the global "processed" list (created by crawl())."""
    if imagelink[0:5] == "File:":
        print ("Skipping file page link")
        return
    # keep only the base name: images are stored flat in FOLDER
    filename = re.findall('.*/(.*)',imagelink)[0]
    if not exists(filename,image=True):
        failcount = 0
        while failcount < MAXFAIL:
            try:
                if VERBOSE: print (" downloading " + URL + imageprefix + imagelink)
                data = (urlopen(URL + imageprefix + imagelink)).read()
                path = local(filename,image=True)
                file = open(path,'wb')  # binary mode: data is bytes
                file.write(data)
                file.close()
            except Exception:
                failcount += 1
            else:
                # success: record the file and leave the retry loop
                processed.append(filename)
                if VERBOSE: print (" saving ",local(filename,image=True))
                return
        # only reached when every retry raised
        print ('Error: unable to fetch file ' + filename)
    else:
        if VERBOSE: print (" skipping ",filename)
def local(page, image=False):
    """returns a local path for a given page/image

    Pages get an .html suffix; images keep their own name."""
    suffix = "" if image else ".html"
    return FOLDER + os.sep + page + suffix
def exists(page, image=False):
    """checks if given page/image already exists locally

    Subpage slashes become dashes, since pages are stored flat."""
    return os.path.exists(local(page.replace("/", "-"), image))
def webroot(url):
    """returns the root (scheme + host) of the given URL

    Accepts both http and https (the original regex only matched
    http://, while the configured wiki URL uses https)."""
    return re.findall('(https?://.*?)/',url)[0]
def output(html,page):
    """encapsulates raw html code into a nice html body and saves it

    Wraps the fragment with a head (utf-8 charset, local stylesheet) and
    a title derived from the page name, then writes it to the local
    file name computed by local()."""
    title = page.replace("_"," ")
    header = "<html><head>"
    header += "<title>" + title + "</title>"
    header += '<meta http-equiv="Content-Type" content="text/html; charset=utf-8">'
    header += "<link type='text/css' href='wiki.css' rel='stylesheet'>"
    header += "</head><body>"
    header += "<h1>" + title + "</h1>"
    footer = "</body></html>"
    html = header+html+footer
    filename = local(page.replace("/","-"))
    if "Command_Reference" in filename:
        # normalize category paging artifacts in the file name
        filename = filename.replace("Category:","")
        filename = filename.replace("&pagefrom=","+")
        filename = filename.replace("#mw-pages","")
        filename = filename.replace(".html.html",".html")
    print (" saving ",filename)
    # encode explicitly: html is a str, so a bare 'wb' write would raise
    # TypeError on python3; utf-8 matches the meta tag declared above
    with open(filename,'wb') as outfile:
        outfile.write(html.encode("utf8"))
if __name__ == "__main__":
    # script entry point: mirror the whole wiki into FOLDER
    crawl()

View File

@@ -1,45 +0,0 @@
#!/usr/bin/env python3
#***************************************************************************
#* *
#* Copyright (c) 2021 Yorik van Havre <yorik@uncreated.net> *
#* *
#* This program is free software; you can redistribute it and/or modify *
#* it under the terms of the GNU Lesser General Public License (LGPL) *
#* as published by the Free Software Foundation; either version 2 of *
#* the License, or (at your option) any later version. *
#* for detail see the LICENCE text file. *
#* *
#* This program is distributed in the hope that it will be useful, *
#* but WITHOUT ANY WARRANTY; without even the implied warranty of *
#* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
#* GNU Library General Public License for more details. *
#* *
#* You should have received a copy of the GNU Library General Public *
#* License along with this program; if not, write to the Free Software *
#* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 *
#* USA *
#* *
#***************************************************************************
"""This script fixes links like href="/Arch_Wall" into href="/Arch_Wall.html" where needed. Dirty hack, downloadwiki.py should be fixed instead"""
import os,re
# rewrite bare internal links (no extension, no anchor) of every local
# html page so they point at the flat "<page>.html" files
files = [f for f in os.listdir("localwiki") if f.endswith(".html")]
for fn in files:
    # context managers so handles are closed even if a regex/replace fails
    with open(os.path.join("localwiki",fn)) as f:
        b = f.read()
    # protect newlines so the scan works on one big line (original scheme)
    b = b.replace("\n","--endl--")
    for href in re.findall("href=\".*?\"",b):
        if (not "." in href) and (not "#" in href):
            repl = href[:-1]+".html\""
            if "href=\"/" in repl:
                # the local copy is flat: drop the leading slash
                repl = repl.replace("href=\"/","href=\"")
            print(fn," : replacing",href,"with",repl)
            b = b.replace(href,repl)
    b = b.replace("--endl--","\n")
    with open(os.path.join("localwiki",fn),"w") as f:
        f.write(b)

View File

@@ -1,199 +0,0 @@
#!/usr/bin/env python
#***************************************************************************
#* *
#* Copyright (c) 2017 Yorik van Havre <yorik@uncreated.net> *
#* *
#* This program is free software; you can redistribute it and/or modify *
#* it under the terms of the GNU Lesser General Public License (LGPL) *
#* as published by the Free Software Foundation; either version 2 of *
#* the License, or (at your option) any later version. *
#* for detail see the LICENCE text file. *
#* *
#* This program is distributed in the hope that it will be useful, *
#* but WITHOUT ANY WARRANTY; without even the implied warranty of *
#* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
#* GNU Library General Public License for more details. *
#* *
#* You should have received a copy of the GNU Library General Public *
#* License along with this program; if not, write to the Free Software *
#* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 *
#* USA *
#* *
#***************************************************************************
__title__="update.py"
__author__ = "Yorik van Havre <yorik@uncreated.net>"
__url__ = "http://www.freecadweb.org"
# NOTE(review): the string below is not a real module docstring (it does
# not come first in the file), but it documents the three run modes
# implemented by update() below.
"""
This script needs to be run after the wiki has been fully downloaded. It has three usages:
1) If no revisions.txt file is found, it parses the contents of the wikifiles.txt file
and, for each entry, it retrieves a corresponding revision ID, and creates a revisions.txt file
2) If a revisions.txt file exists but no update.txt file exists, it crawls through all entries of
wikifiles.txt, and for each one, compares the current revision with the one stored in revisions.txt.
An update.txt file is created with all pages that have different revision IDs
3) If update.txt exists, each entry of it will be scanned again for new links and all the needed
files downloaded. Revision.txt and wikifiles.txt get also updated.
"""
import sys, os, re
from urllib.request import urlopen
from urllib.error import HTTPError
# CONFIGURATION #################################################
URL = "https://www.freecadweb.org/wiki" #default URL if no URL is passed
GETTRANSLATIONS = False # Set true if you want to get the translations too.
MAXFAIL = 3 # max number of retries if download fails
VERBOSE = True # to display what's going on. Otherwise, runs totally silent.
# END CONFIGURATION ##############################################
wikiindex = "/index.php?title="
def update(pagename=None):
    """Updates the local wiki copy. The mode is chosen from the state
    files present on disk:
    1) no revisions.txt: read wikifiles.txt and store the current
       revision ID of every page in a new revisions.txt
    2) revisions.txt but no updates.txt: compare each stored revision ID
       with the current one and write the changed pages to updates.txt
    3) revisions.txt and updates.txt: rescan/redownload every page listed
       in updates.txt, then refresh wikifiles.txt and revisions.txt

    Bug fix: the revision is taken with r[-1] instead of r[1] — page
    names may themselves contain ":", in which case r[1] is a fragment
    of the page name, not the revision ID.
    The pagename argument is currently unused (kept for compatibility
    with the command-line invocation)."""
    if not os.path.exists("revisions.txt"): # case 1)
        if not os.path.exists("wikifiles.txt"):
            print("No wikifiles.txt found. Aborting")
            sys.exit()
        pages = []
        f = open("wikifiles.txt","r")
        if VERBOSE: print("Reading existing list...")
        for l in f.readlines():
            if l.strip() != "":
                # entries containing /wiki/ are skipped — presumably
                # these are file/image URLs with no revision ID; confirm
                if not "/wiki/" in l:
                    if VERBOSE: print("Adding ",l.strip())
                    pages.append(l.strip())
        f.close()
        if VERBOSE: print("Added ",str(len(pages))," entries")
        i = 1
        revs = []
        for page in pages:
            rev = getRevision(page)
            if VERBOSE: print(str(i)," revision: ",rev)
            revs.append(page+":"+rev)
            i += 1
        writeList(revs,"revisions.txt")
        print("All done. Successfully written revisions.txt with ",len(revs)," entries.")
    elif os.path.exists("revisions.txt") and (not os.path.exists("updates.txt")): # case 2)
        f = open("revisions.txt","r")
        if VERBOSE: print("Reading revisions list...")
        revisions = {}
        for l in f.readlines():
            if l.strip() != "":
                r = l.strip().split(":")
                p = ":".join(r[:-1])
                if VERBOSE: print("Adding ",p)
                # r[-1], not r[1]: page names may themselves contain ":"
                revisions[p] = r[-1]
        f.close()
        if VERBOSE: print("Added ",str(len(list(revisions.keys())))," entries")
        updates = []
        i = 1
        for page in list(revisions.keys()):
            rev = getRevision(page)
            if rev != revisions[page]:
                if VERBOSE: print(str(i),page," has a new revision: ",rev)
                updates.append(page)
            else:
                if VERBOSE: print(str(i),page," is up to date ")
            i += 1
        if updates:
            writeList(updates,"updates.txt")
            print("All done. Successfully written updates.txt with ",len(updates)," entries.")
        else:
            print("Everything up to date. Nothing to be done.")
    elif os.path.exists("revisions.txt") and os.path.exists("updates.txt"): # case 3)
        if not os.path.exists("wikifiles.txt"):
            print("No wikifiles.txt found. Aborting")
            sys.exit()
        wikifiles = []
        f = open("wikifiles.txt","r")
        if VERBOSE: print("Reading wikifiles list...")
        for l in f.readlines():
            if l.strip() != "":
                wikifiles.append(l.strip())
        f.close()
        if VERBOSE: print("Read ",str(len(wikifiles))," entries")
        f = open("revisions.txt","r")
        if VERBOSE: print("Reading revisions list...")
        revisions = {}
        for l in f.readlines():
            if l.strip() != "":
                r = l.strip().split(":")
                p = ":".join(r[:-1])
                # r[-1], not r[1]: page names may themselves contain ":"
                revisions[p] = r[-1]
        f.close()
        todo = []
        f = open("updates.txt","r")
        if VERBOSE: print("Reading updates list...")
        for l in f.readlines():
            if l.strip() != "":
                todo.append(l.strip())
        f.close()
        if VERBOSE: print(str(len(todo))," pages to scan...")
        # reuse the crawler to rescan changed pages for new dependencies
        import buildwikiindex
        buildwikiindex.WRITETHROUGH = False
        buildwikiindex.VERBOSE = VERBOSE
        updates = []
        for t in todo:
            if VERBOSE: print("Scanning ",t)
            updates.extend(buildwikiindex.crawl(t))
        updates = [u for u in updates if not u in wikifiles]
        if VERBOSE: print(str(len(updates))," files to download...")
        import downloadwiki
        i = 1
        for u in updates:
            if VERBOSE: print(i, ": Fetching ", u)
            downloadwiki.get(u)
            if not "/wiki/" in u:
                rev = getRevision(u)
                revisions[u] = rev
                if not u in wikifiles:
                    wikifiles.append(u)
            i += 1
        if VERBOSE: print("Updating wikifiles and revisions...")
        writeList(wikifiles,"wikifiles.txt")
        updatedrevs = []
        for k in list(revisions.keys()):
            updatedrevs.append(k+":"+revisions[k])
        writeList(updatedrevs,"revisions.txt")
        # the update cycle is complete: next run starts at case 2) again
        os.remove("updates.txt")
        if VERBOSE: print("All done!")
def getRevision(page):
    """returns the current wiki revision ID of the given page

    Exits the program if the ID cannot be extracted."""
    source = fetchPage(page)
    # the revision ID is embedded in the page's javascript config vars
    found = re.findall("wgCurRevisionId\"\:(.*?),", source)
    if len(found) != 1:
        print('Error: unable to get revision ID of ' + page)
        sys.exit()
    return found[0]
def fetchPage(page):
    """retrieves given page from the wiki and returns its html as a str

    Retries up to MAXFAIL times on HTTP errors; exits the program if the
    page cannot be fetched."""
    print("fetching: ",page)
    failcount = 0
    while failcount < MAXFAIL:
        try:
            # decode: urlopen().read() returns bytes on python3, but
            # getRevision() runs a str regex over the result
            html = urlopen(URL + wikiindex + page).read().decode("utf8")
            return html
        except HTTPError:
            failcount += 1
    print('Error: unable to fetch page ' + page)
    sys.exit()
def writeList(pages,filename):
    """writes the given list of str entries to filename, one per line

    Opens in text mode: the entries are str, so the original 'wb' mode
    raised TypeError on python3."""
    with open(filename,"w") as f:
        for p in pages:
            f.write(p+"\n")
    if VERBOSE: print("written ",filename)
if __name__ == "__main__":
    # script entry point; note sys.argv[1:] is a list, while update()
    # names its (unused) parameter "pagename"
    update(sys.argv[1:])