Tools: Removing obsolete wiki-to-qhelp scripts
Note: A copy of these scripts is kept at https://github.com/yorikvanhavre/FreeCAD-offlinedoc-scripts for archiving
This commit is contained in:
@@ -1,41 +0,0 @@
|
||||
this suite of tools can be used to retrieve a local copy
|
||||
from the FreeCAD wiki and then use it to generate qhelp
|
||||
and pdf files. The downloading of the entire wiki is now
|
||||
a huge operation, prone to network errors, so it has been
|
||||
cut into 2 parts, one to retrieve a list of files to
|
||||
download and another to actually download the files.
|
||||
|
||||
1) run "buildwikiindex.py" to build an index file containing
|
||||
a list of all the files to download
|
||||
|
||||
2) run "downloadwiki.py". If connection drops, run it again,
|
||||
the already downloaded files will be skipped.
|
||||
|
||||
2b) Dirty hack: run "fixlinks.py" to fix wrong html links
|
||||
(downloadwiki.py should be fixed in the future)
|
||||
|
||||
3) run "buildqhelp.py" to generate freecad.qhc and freecad.qch
|
||||
files
|
||||
|
||||
4) run "buildpdf.py" to generate freecad.pdf (wkhtmltopdf must be installed)
|
||||
|
||||
5) the qhelp files can be tested with "assistant -collectionFile freecad.qhc"
|
||||
|
||||
6) If you have already downloaded the whole wiki, run "update.py" immediately
|
||||
after, to create a list of revision IDs for each page.
|
||||
|
||||
7) Once the initial revisions list has been created, the "update.py" script
|
||||
can be ran anytime in the future, to check for pages that have changed
|
||||
since the stored revision ID. The script is meant to run twice, one to get
|
||||
a list of pages that have changed, and another one to download the changed
|
||||
pages (and all their dependencies) again.
|
||||
|
||||
8) To split the generated freecad.qch into parts that are smaller than 50Mb
|
||||
(github limit): split -d --byte=49M localwiki/freecad.qch localwiki/freecad.qch.part
|
||||
|
||||
9) To join the parts again (for testing): cat localwiki/freecad.qch.part* >> test.qch
|
||||
Then check that test.qch has the same md5 sum as localwiki/freecad.qch
|
||||
|
||||
10) To test: assistant -collectionFile localwiki/freecad.qhc
|
||||
|
||||
|
||||
@@ -1,322 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
#***************************************************************************
|
||||
#* *
|
||||
#* Copyright (c) 2009 Yorik van Havre <yorik@uncreated.net> *
|
||||
#* *
|
||||
#* This program is free software; you can redistribute it and/or modify *
|
||||
#* it under the terms of the GNU Lesser General Public License (LGPL) *
|
||||
#* as published by the Free Software Foundation; either version 2 of *
|
||||
#* the License, or (at your option) any later version. *
|
||||
#* for detail see the LICENCE text file. *
|
||||
#* *
|
||||
#* This program is distributed in the hope that it will be useful, *
|
||||
#* but WITHOUT ANY WARRANTY; without even the implied warranty of *
|
||||
#* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
|
||||
#* GNU Library General Public License for more details. *
|
||||
#* *
|
||||
#* You should have received a copy of the GNU Library General Public *
|
||||
#* License along with this program; if not, write to the Free Software *
|
||||
#* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 *
|
||||
#* USA *
|
||||
#* *
|
||||
#***************************************************************************
|
||||
|
||||
__title__="buildpdf"
|
||||
__author__ = "Yorik van Havre <yorik@uncreated.net>"
|
||||
__url__ = "http://www.freecadweb.org"
|
||||
|
||||
"""
|
||||
This script builds a pdf file from a local copy of the wiki
|
||||
"""
|
||||
|
||||
Workbenches=["Part","PartDesign","Sketcher","Constraints","Draft","Path","Fem","Arch","TechDraw","Raytracing","OpenSCAD","Robot","Mesh"]
|
||||
|
||||
TOC="""Online_Help_Startpage
|
||||
About_FreeCAD
|
||||
Feature_list
|
||||
Installing
|
||||
Getting_started
|
||||
Mouse_Model
|
||||
Document_structure
|
||||
Property_editor
|
||||
Import_Export
|
||||
Workbenches
|
||||
|
||||
WorkbenchesList
|
||||
|
||||
Interface_Customization
|
||||
Preferences_Editor
|
||||
Macros
|
||||
Introduction_to_Python
|
||||
Python_scripting_tutorial
|
||||
Topological_data_scripting
|
||||
Mesh_Scripting
|
||||
Mesh_to_Part
|
||||
Scenegraph
|
||||
Pivy
|
||||
|
||||
begin
|
||||
|
||||
PySide
|
||||
PySide_Beginner_Examples
|
||||
PySide_Medium_Examples
|
||||
PySide_Advanced_Examples
|
||||
|
||||
end
|
||||
|
||||
Scripted_objects
|
||||
Embedding_FreeCAD
|
||||
Embedding_FreeCADGui
|
||||
Code_snippets"""
|
||||
|
||||
import sys, os, shutil, time
|
||||
from urllib.request import urlopen
|
||||
|
||||
# CONFIGURATION #################################################
|
||||
|
||||
INDEX = "Online_Help_Toc" # the start page from where to crawl the wiki
|
||||
PDFCONVERTOR = 'wkhtmltopdf' # can be 'pisa', 'htmldoc', 'wkhtmltopdf' or 'firefox'
|
||||
VERBOSE = True # set true to get output messages
|
||||
INCLUDECOMMANDS = True # if true, the command pages of each workbench are included after each WB page
|
||||
OVERWRITE = False # if true, pdf files are recreated even if already existing
|
||||
FIREFOXPDFFOLDER = os.path.expanduser("~")+os.sep+"PDF" # if firefox is used, set this to where it places its pdf files by default
|
||||
COVER = "http://www.freecadweb.org/wiki/images/7/79/Freecad-pdf-cover.svg"
|
||||
|
||||
# END CONFIGURATION ##############################################
|
||||
|
||||
|
||||
FOLDER = "./localwiki"
|
||||
|
||||
fcount = dcount = 0
|
||||
|
||||
def crawl():
    "Creates a pdf file from the localwiki folder; returns 0 on success, 1 on error."

    # tests ###############################################
    # Verify the selected converter and its dependencies before doing any work.

    if PDFCONVERTOR == 'pisa':
        try:
            import ho.pisa as pisa
        except Exception:
            # Bug fix: the original had a bare string literal here (a no-op);
            # the error must actually be printed.
            print("Error: Python-pisa not installed, exiting.")
            return 1
    elif PDFCONVERTOR == 'htmldoc':
        if os.system('htmldoc --version'):
            print("Error: Htmldoc not found, exiting.")
            return 1
    try:
        # PyPDF2 is always required by joinpdf(), whatever the converter.
        from PyPDF2 import PdfFileReader, PdfFileWriter
    except Exception:
        print("Error: Python-pypdf2 not installed, exiting.")
        # Bug fix: the original printed the error but fell through, which
        # would only crash later inside joinpdf().
        return 1

    # run ########################################################

    buildpdffiles()
    joinpdf()

    if VERBOSE: print("All done!")
    return 0
|
||||
|
||||
|
||||
def buildpdffiles():
    "Scans the wiki folder for html files and converts each one to pdf."
    global fileslist
    # Collect every .html page in the folder (listdir happens before the
    # stylesheet is written, so wkhtmltopdf.css never ends up in the list).
    fileslist = [name for name in os.listdir(FOLDER) if name.endswith('.html')]
    if PDFCONVERTOR == 'wkhtmltopdf':
        # wkhtmltopdf needs the custom stylesheet to exist first.
        makeStyleSheet()
    print("converting ", len(fileslist), " pages")
    # Dispatch each page to the converter selected in the configuration;
    # anything unrecognized falls back to htmldoc, as before.
    converters = {
        'pisa': createpdf_pisa,
        'wkhtmltopdf': createpdf_wkhtmltopdf,
        'firefox': createpdf_firefox,
    }
    convert = converters.get(PDFCONVERTOR, createpdf_htmldoc)
    for position, name in enumerate(fileslist, start=1):
        print(position, " : ", name)
        convert(name[:-5])  # strip the ".html" extension
|
||||
|
||||
|
||||
def fetch_resources(uri, rel):
    """
    Callback allowing pisa/reportlab to resolve images, stylesheets, etc.

    'uri' is the href attribute from the html link element.
    'rel' gives a relative path, but it's not used here.

    Note from Yorik: Not working!!
    """
    # Map the document-relative reference onto the local wiki folder.
    cleaned = uri.replace("./", "")
    return os.path.join(FOLDER, cleaned)
|
||||
|
||||
def createpdf_pisa(pagename):
    "Creates a pdf file from a saved page using pisa (python module)."
    import ho.pisa as pisa
    if (not exists(pagename+".pdf",image=True)) or OVERWRITE:
        # Bug fix: 'ro' is not a valid open() mode; pisa wants binary read.
        infile = open(FOLDER + os.sep + pagename+'.html','rb')
        outfile = open(FOLDER + os.sep + pagename+'.pdf','wb')
        if VERBOSE: print("Converting " + pagename + " to pdf...")
        pdf = pisa.CreatePDF(infile,outfile,FOLDER,link_callback=fetch_resources)
        # Close the input handle too (the original leaked it).
        infile.close()
        outfile.close()
        if pdf.err:
            return pdf.err
    return 0
|
||||
|
||||
|
||||
def createpdf_firefox(pagename):
    "creates a pdf file from a saved page using firefox (needs command line printing extension)"
    # The default printer will be used, so make sure it is set to pdf.
    # Command line printing extension: http://forums.mozillazine.org/viewtopic.php?f=38&t=2729795
    if OVERWRITE or not exists(pagename + ".pdf", image=True):
        source = FOLDER + os.sep + pagename + '.html'
        target = FOLDER + os.sep + pagename + '.pdf'
        os.system('firefox -print ' + source)
        # Give firefox some time to finish printing before looking for the output.
        time.sleep(6)
        printed = FIREFOXPDFFOLDER + os.sep + pagename + ".pdf"
        if os.path.exists(printed):
            # Firefox drops the pdf in its own folder; move it next to the html.
            shutil.move(printed, target)
        else:
            print("-----------------------------------------> Couldn't find print output!")
|
||||
|
||||
|
||||
def createpdf_htmldoc(pagename):
    "creates a pdf file from a saved page using htmldoc (external app, but supports images)"
    if OVERWRITE or not exists(pagename + ".pdf", image=True):
        source = FOLDER + os.sep + pagename + '.html'
        target = FOLDER + os.sep + pagename + '.pdf'
        # htmldoc renders the single page directly to pdf; propagate its exit code.
        return os.system('htmldoc --webpage --textfont sans --browserwidth 840 -f ' + target + ' ' + source)
|
||||
|
||||
|
||||
def createpdf_wkhtmltopdf(pagename):
    "creates a pdf file from a saved page using wkhtmltopdf (external app, supports images)"
    if (not exists(pagename+".pdf",image=True)) or OVERWRITE:
        infile = FOLDER + os.sep + pagename+'.html'
        outfile = FOLDER + os.sep + pagename+'.pdf'
        cmd = 'wkhtmltopdf -L 5mm --user-style-sheet '+FOLDER+os.sep+'wkhtmltopdf.css '+infile+' '+outfile
        print(cmd)
        # Bug fix: the conversion command was commented out, so this converter
        # printed the command but never produced a pdf. Run it and propagate
        # the exit code like the other converters do.
        return os.system(cmd)
    else:
        print("skipping")
|
||||
|
||||
|
||||
def joinpdf():
    "creates one pdf file from several others, following order from the cover"
    from PyPDF2 import PdfFileReader,PdfFileWriter
    if VERBOSE: print("Building table of contents...")

    # The cover page always comes first in the assembled book.
    result = PdfFileWriter()
    createCover()
    inputfile = PdfFileReader(open(FOLDER+os.sep+'Cover.pdf','rb'))
    result.addPage(inputfile.getPage(0))
    count = 1  # page index where the next appended document will start

    tocfile = createTOC()
    # 'parent' is a small state machine:
    #   False -> outside a begin/end group (top-level bookmarks)
    #   True  -> just saw "begin"; the next bookmark becomes the group parent
    #   <bookmark object> -> inside a group; nest bookmarks under it
    parent = False
    for page in tocfile:
        page = page.strip()
        if page:
            if page[0] == "#":
                # comment line in the TOC
                continue
            if page == "begin":
                parent = True
                continue
            if page == "end":
                parent = False
                continue
            if VERBOSE: print('Appending ',page, " at position ",count)
            title = page.replace("_"," ")
            pdffile = page + ".pdf"
            if exists(pdffile,True):
                inputfile = PdfFileReader(open(FOLDER + os.sep + pdffile,'rb'))
                numpages = inputfile.getNumPages()
                for i in range(numpages):
                    result.addPage(inputfile.getPage(i))
                if parent == True:
                    # first page after "begin": its bookmark becomes the parent
                    parent = result.addBookmark(title,count)
                elif parent == False:
                    result.addBookmark(title,count)
                else:
                    # nested bookmark under the current group parent
                    result.addBookmark(title,count,parent)
                count += numpages
            else:
                # A missing page means the TOC and the folder are out of sync;
                # abort rather than produce an incomplete book.
                print("page ",pdffile," not found, aborting.")
                sys.exit()

    if VERBOSE: print("Writing...")
    outputfile = open(FOLDER+os.sep+"freecad.pdf",'wb')
    result.write(outputfile)
    outputfile.close()
    if VERBOSE:
        print(' ')
        print('Successfully created '+FOLDER+os.sep+'freecad.pdf')
|
||||
|
||||
|
||||
def createTOC():
    """Return the TOC as a list of page names, expanding WorkbenchesList.

    The "WorkbenchesList" placeholder in the TOC template is replaced by one
    begin/end group per workbench, containing the workbench page, its command
    pages and its tutorial, when the corresponding pdf files exist.
    """
    tocfile = TOC.split("\n")
    # Bug fix: os.listdir() yields names WITH the ".pdf" suffix, but all the
    # comparisons below use bare page names; strip the suffix once here
    # (the original never matched any workbench page because of this).
    files = [f[:-4] for f in os.listdir(FOLDER) if f.endswith(".pdf")]
    wbpages = []
    for wb in Workbenches:
        # Bug fix: was `wbpage += "begin"` (NameError on an undefined name).
        wbpages.append("begin")
        if wb+"_Workbench" in files:
            wbpages.append(wb+"_Workbench")
        for f in files:
            if f.lower().startswith(wb.lower()+"_"):
                if (not f.lower().endswith("_workbench")) and (not f.lower().endswith("tutorial")):
                    # Bug fix: was `wb.append(f)` — wb is a string; the page
                    # belongs in the wbpages list.
                    wbpages.append(f)
        if wb+"_tutorial" in files:
            wbpages.append(wb+"_tutorial")
        wbpages.append("end")
    # Substitute the placeholder, keep every other TOC line as-is.
    toc = []
    for i in tocfile:
        if i == "WorkbenchesList":
            toc.extend(wbpages)
        else:
            toc.append(i)
    return toc
|
||||
|
||||
|
||||
def local(page, image=False):
    "returns a local path for a given page/image"
    # Images keep their own extension; wiki pages are stored as .html files.
    suffix = '' if image else '.html'
    return FOLDER + os.sep + page + suffix
|
||||
|
||||
|
||||
def exists(page, image=False):
    "checks if given page/image already exists"
    # os.path.exists already yields the boolean we need.
    return os.path.exists(local(page, image))
|
||||
|
||||
|
||||
def makeStyleSheet():
    "Creates a stylesheet for wkhtmltopdf."
    # Bug fix: the file was opened in binary mode ('wb') but written a str,
    # which raises TypeError on Python 3. Open in text mode, and use a
    # context manager so the file is always closed.
    with open(FOLDER + os.sep + "wkhtmltopdf.css", 'w') as outputfile:
        outputfile.write("""
.printfooter {
display:none !important;
}
""")
|
||||
|
||||
|
||||
def createCover():
    "downloads and creates a cover page"
    if VERBOSE: print("fetching " + COVER)
    svgpath = FOLDER + os.sep + "Cover.svg"
    # Fetch the svg cover artwork from the wiki and store it locally.
    data = urlopen(COVER).read()
    with open(svgpath, 'wb') as svgfile:
        svgfile.write(data)
    # Inkscape renders the svg into the one-page pdf used as the book cover.
    os.system('inkscape --export-pdf=' + FOLDER + os.sep + 'Cover.pdf' + ' ' + svgpath)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Build the pdf book when invoked as a script.
    crawl()
|
||||
@@ -1,246 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
#***************************************************************************
|
||||
#* *
|
||||
#* Copyright (c) 2009 Yorik van Havre <yorik@uncreated.net> *
|
||||
#* *
|
||||
#* This program is free software; you can redistribute it and/or modify *
|
||||
#* it under the terms of the GNU Lesser General Public License (LGPL) *
|
||||
#* as published by the Free Software Foundation; either version 2 of *
|
||||
#* the License, or (at your option) any later version. *
|
||||
#* for detail see the LICENCE text file. *
|
||||
#* *
|
||||
#* This program is distributed in the hope that it will be useful, *
|
||||
#* but WITHOUT ANY WARRANTY; without even the implied warranty of *
|
||||
#* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
|
||||
#* GNU Library General Public License for more details. *
|
||||
#* *
|
||||
#* You should have received a copy of the GNU Library General Public *
|
||||
#* License along with this program; if not, write to the Free Software *
|
||||
#* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 *
|
||||
#* USA *
|
||||
#* *
|
||||
#***************************************************************************
|
||||
|
||||
__title__="wiki2qhelp"
|
||||
__author__ = "Yorik van Havre <yorik@uncreated.net>"
|
||||
__url__ = "http://www.freecadweb.org"
|
||||
|
||||
"""
|
||||
This script builds qhelp files from a local copy of the wiki
|
||||
"""
|
||||
|
||||
import os, re, shutil
|
||||
|
||||
# CONFIGURATION #################################################
|
||||
|
||||
FOLDER = "./localwiki"
|
||||
INDEX = "Online_Help_Toc" # the start page from where to crawl the wiki
|
||||
VERBOSE = True # to display what's going on. Otherwise, runs totally silent.
|
||||
QHELPCOMPILER = 'qhelpgenerator'
|
||||
RELEASE = '0.19'
|
||||
|
||||
# END CONFIGURATION ##############################################
|
||||
|
||||
fcount = dcount = 0
|
||||
|
||||
def crawl():
    "Builds the freecad.qch and freecad.qhc help files from the local wiki folder."

    # tests ###############################################

    # Make sure the Qt help generator is available before doing any work.
    if os.system(QHELPCOMPILER +' -v'):
        print ("Error: QAssistant not fully installed, exiting.")
        return 1

    # run ########################################################

    qhp = buildtoc()
    qhcp = createCollProjectFile()
    # The collection project references this icon, so it must sit in localwiki.
    shutil.copy("../../Gui/Icons/freecad-icon-64.png","localwiki/freecad-icon-64.png")
    if generate(qhcp):
        print ("Error while generating")
        return 1
    if compile(qhp):
        print ("Error while compiling")
        return 1
    if VERBOSE: print ("All done!")
    # The interactive copy-to-source-tree step below is disabled; the
    # generated files are left in localwiki instead.
    #i=raw_input("Copy the files to their correct location in the source tree? y/n (default=no) ")
    #if i.upper() in ["Y","YES"]:
    #    shutil.copy("localwiki/freecad.qch","../../Doc/freecad.qch")
    #    shutil.copy("localwiki/freecad.qhc","../../Doc/freecad.qhc")
    #else:
    print ('Files freecad.qch and freecad.qhc are in localwiki. Test with "assistant -collectionFile localwiki/freecad.qhc"')
    return 0
|
||||
|
||||
def compile(qhpfile):
    "compiles the whole html doc with qassistant"
    # NOTE: shadows the 'compile' builtin; the name is kept so callers work.
    qchfile = FOLDER + os.sep + "freecad.qch"
    status = os.system(QHELPCOMPILER + ' ' + qhpfile + ' -o ' + qchfile)
    if status:
        # non-zero exit code from the generator means failure
        return 1
    if VERBOSE: print ("Successfully created",qchfile)
    return 0
|
||||
|
||||
def generate(qhcpfile):
    "generates qassistant-specific settings like icon, title, ..."
    # Text shown in the assistant's About dialog.
    txt="""
<center>FreeCAD """+RELEASE+""" help files<br/>
<a href="http://www.freecadweb.org">http://www.freecadweb.org</a></center>
"""
    with open(FOLDER + os.sep + "about.txt", "w") as about:
        about.write(txt)
    qhcfile = FOLDER + os.sep + "freecad.qhc"
    # Run the collection generator; os.system returns 0 on success.
    if os.system(QHELPCOMPILER + ' ' + qhcpfile + ' -o ' + qhcfile):
        return 1
    if VERBOSE: print ("Successfully created ",qhcfile)
    return 0
|
||||
|
||||
def createCollProjectFile():
    "Writes the Qt help collection project (.qhcp) file and returns its path."
    # Collection project template: sets the assistant window's branding
    # (title, icon, about dialog) and registers the compiled freecad.qch.
    qprojectfile = '''<?xml version="1.0" encoding="UTF-8"?>
<QHelpCollectionProject version="1.0">
    <assistant>
        <title>FreeCAD User Manual</title>
        <applicationIcon>freecad-icon-64.png</applicationIcon>
        <cacheDirectory base="collection">freecad/freecad</cacheDirectory>
        <startPage>qthelp://org.freecad.usermanual/doc/Online_Help_Startpage.html</startPage>
        <aboutMenuText>
            <text>About FreeCAD</text>
        </aboutMenuText>
        <aboutDialog>
            <file>about.txt</file>
            <icon>freecad-icon-64.png</icon>
        </aboutDialog>
        <enableDocumentationManager>true</enableDocumentationManager>
        <enableAddressBar>true</enableAddressBar>
        <enableFilterFunctionality>true</enableFilterFunctionality>
    </assistant>
    <docFiles>
        <generate>
            <file>
                <input>freecad.qhp</input>
                <output>freecad.qch</output>
            </file>
        </generate>
        <register>
            <file>freecad.qch</file>
        </register>
    </docFiles>
</QHelpCollectionProject>
'''
    if VERBOSE: print ("Building project file...")
    qfilename = FOLDER + os.sep + "freecad.qhcp"
    f = open(qfilename,'w')
    f.write(qprojectfile)
    f.close()
    if VERBOSE: print ("Done writing qhcp file:",qfilename)
    return qfilename
|
||||
|
||||
def buildtoc():
    '''
    gets the table of contents page and parses its
    contents into a clean lists structure
    '''

    # Template for the .qhp project file; the <inserttoc>, <insertkeywords>
    # and <insertfiles> placeholders are substituted further below.
    qhelpfile = '''<?xml version="1.0" encoding="UTF-8"?>
<QtHelpProject version="1.0">
    <namespace>org.freecad.usermanual</namespace>
    <virtualFolder>doc</virtualFolder>
    <!--
    <customFilter name="FreeCAD '''+RELEASE+'''">
        <filterAttribute>FreeCAD</filterAttribute>
        <filterAttribute>'''+RELEASE+'''</filterAttribute>
    </customFilter>
    -->
    <filterSection>
        <!--
        <filterAttribute>FreeCAD</filterAttribute>
        <filterAttribute>'''+RELEASE+'''</filterAttribute>
        -->
        <toc>
            <inserttoc>
        </toc>
        <keywords>
            <insertkeywords>
        </keywords>
        <insertfiles>
    </filterSection>
</QtHelpProject>
'''

    def getname(line):
        # Extracts a (title, link) pair from one toc <li> entry.
        line = re.compile('<li>').sub('',line)
        line = re.compile('</li>').sub('',line)
        title = line.strip()
        link = ''
        if "<a" in line:
            title = re.findall('<a[^>]*>(.*?)</a>',line)[0].strip()
            link = re.findall('href="(.*?)"',line)[0].strip()
        if link:
            # normalize to a relative *.html file name
            if not link.endswith(".html"):
                link = link + ".html"
            if link.startswith("/"):
                link = link[1:]
        if not link: link = 'default.html'
        if title.startswith("<img"):
            # workbenches: derive title and link from the workbench icon name
            wb = re.findall("Workbench\_(.*?)\.svg",title)[0]
            title = wb + " Workbench"
            link = wb + "_Workbench.html"
        return title,link

    if VERBOSE: print ("Building table of contents...")
    # Read the whole index page into a single string.
    f = open(FOLDER+os.sep+INDEX+'.html')
    html = ''
    for line in f: html += line
    f.close()
    # Collapse whitespace so the list items can be matched with regexes.
    html = html.replace("\n"," ")
    html = html.replace("> <","><")
    # Keep only the outermost <ul> list of the toc page.
    html = re.findall("<ul.*/ul>",html)[0]
    items = re.findall('<li[^>]*>.*?</li>|</ul></li>',html)
    inserttoc = '<section title="FreeCAD Documentation" ref="Online_Help_Toc.html">\n'
    insertkeywords = ''
    for item in items:
        if not ("<ul>" in item):
            if ("</ul>" in item):
                # closing marker of a nested list: close the current section
                inserttoc += ' </section>\n'
            else:
                # plain entry: one keyword and one self-closing section
                link = ''
                title,link=getname(item)
                if link:
                    link='" ref="'+link
                insertkeywords += ('<keyword name="'+title+link+'"/>\n')
                if link and title:
                    inserttoc += (' <section title="'+title+link+'"></section>\n')
        else:
            # entry that opens a nested list: emit one section per sub-entry,
            # closing the last one on the same line
            subitems = item.split("<ul>")
            for i in range(len(subitems)):
                link = ''
                title,link=getname(subitems[i])
                if link:
                    link='" ref="'+link
                insertkeywords += ('<keyword name="'+title+link+'"/>\n')
                trail = ''
                if i == len(subitems)-1: trail = '</section>'
                if link and title:
                    inserttoc += (' <section title="'+title+link+'">'+trail+'\n')
    inserttoc += '</section>\n'

    # The .qch archive must list every file of the local wiki folder.
    insertfiles = "<files>\n"
    for fil in os.listdir(FOLDER):
        insertfiles += ("<file>"+fil+"</file>\n")
    insertfiles += "</files>\n"

    # Fill the template placeholders and write the project file.
    qhelpfile = re.compile('<insertkeywords>').sub(insertkeywords,qhelpfile)
    qhelpfile = re.compile('<inserttoc>').sub(inserttoc,qhelpfile)
    qhelpfile = re.compile('<insertfiles>').sub(insertfiles,qhelpfile)
    qfilename = FOLDER + os.sep + "freecad.qhp"
    f = open(qfilename,'w')
    f.write(qhelpfile)
    f.close()
    if VERBOSE: print ("Done writing qhp file:",qfilename)
    return qfilename
|
||||
|
||||
if __name__ == "__main__":
    # Build the qhelp files when invoked as a script.
    crawl()
|
||||
|
||||
@@ -1,222 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
#***************************************************************************
|
||||
#* *
|
||||
#* Copyright (c) 2009 Yorik van Havre <yorik@uncreated.net> *
|
||||
#* *
|
||||
#* This program is free software; you can redistribute it and/or modify *
|
||||
#* it under the terms of the GNU Lesser General Public License (LGPL) *
|
||||
#* as published by the Free Software Foundation; either version 2 of *
|
||||
#* the License, or (at your option) any later version. *
|
||||
#* for detail see the LICENCE text file. *
|
||||
#* *
|
||||
#* This program is distributed in the hope that it will be useful, *
|
||||
#* but WITHOUT ANY WARRANTY; without even the implied warranty of *
|
||||
#* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
|
||||
#* GNU Library General Public License for more details. *
|
||||
#* *
|
||||
#* You should have received a copy of the GNU Library General Public *
|
||||
#* License along with this program; if not, write to the Free Software *
|
||||
#* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 *
|
||||
#* USA *
|
||||
#* *
|
||||
#***************************************************************************
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
__title__="buildwikiindex.py"
|
||||
__author__ = "Yorik van Havre <yorik@uncreated.net>"
|
||||
__url__ = "http://www.freecadweb.org"
|
||||
|
||||
"""
|
||||
This script parses the contents of a wiki site and saves a file containing
|
||||
names of pages and images to be downloaded.
|
||||
"""
|
||||
|
||||
import sys, os, re
|
||||
from urllib2 import urlopen, HTTPError
|
||||
|
||||
# CONFIGURATION #################################################
|
||||
|
||||
URL = "https://www.freecadweb.org/wiki" #default URL if no URL is passed
|
||||
INDEX = "Online_Help_Toc" # the start page from where to crawl the wiki
|
||||
NORETRIEVE = ['Manual','Developer_hub','Power_users_hub','Users_hub','Source_documentation',
|
||||
'User_hub','Main_Page','About_this_site','Interesting_links','Syndication_feeds',
|
||||
'FreeCAD:General_disclaimer','FreeCAD:About','FreeCAD:Privacy_policy','WikiPages'] # pages that won't be fetched (kept online)
|
||||
NORETRIEVE += ['Constraint_Concentric','Constraint_EqualLength','Constraint_ExternalAngle',
|
||||
'Constraint_Horizontal','Constraint_HorizontalDistance','Constraint_Internal_Alignment',
|
||||
'Constraint_InternalAngle','Constraint_Length','Constraint_Lock','Constraint_Parallel',
|
||||
'Constraint_Perpendicular','Constraint_PointOnEnd','Constraint_PointOnMidPoint',
|
||||
'Constraint_PointOnObject','Constraint_PointOnPoint','Constraint_PointOnStart',
|
||||
'Constraint_PointToObject','Constraint_Radius','Constraint_SnellsLaw',
|
||||
'Constraint_Symmetric','Constraint_Tangent','Constraint_TangentToEnd',
|
||||
'Constraint_TangentToStart','Constraint_Vertical',
|
||||
'Join_Cutout','Join_Embed','Part_BooleanFragment','Part_Sections','Curves_HelicalSweep',
|
||||
'CurvedShapes_FlyingWingS800','CurvedShapes_HortenHIX','CurvedShapes_SurfaceCut',
|
||||
'CurvedShapes_InterpolatedMiddle','CurvedShapes_CurvedSegment','Arch_Cell',
|
||||
'Std_ClippingPlane','Std_AboutQt'] # pages that have been renamed but still dangle around...
|
||||
GETTRANSLATIONS = False # Set true if you want to get the translations too.
|
||||
MAXFAIL = 3 # max number of retries if download fails
|
||||
VERBOSE = True # to display what's going on. Otherwise, runs totally silent.
|
||||
WRITETHROUGH = True # if true, fetched files are constantly written to disk, in case of failure.
|
||||
|
||||
# END CONFIGURATION ##############################################
|
||||
|
||||
wikiindex = "/index.php?title="
|
||||
|
||||
def crawl(pagename=[]):
    "downloads an entire wiki site"
    # NOTE(review): mutable default argument; harmless here only because
    # 'pagename' is never mutated inside this function.
    todolist = []
    processed = []
    count = 1
    if pagename:
        # Explicit page(s) given on the command line: scan only those.
        if not isinstance(pagename,list):
            pagename = [pagename]
        todolist = pagename
    else:
        # Resume mode: reload what was already processed in a previous run...
        if os.path.exists("wikifiles.txt"):
            f = open("wikifiles.txt","r")
            if VERBOSE: print ("Reading existing list...")
            for l in f.readlines():
                if l.strip() != "":
                    if VERBOSE: print ("Adding ",l)
                    processed.append(l.strip())
            f.close()
        # ...and what was still left to do when that run stopped.
        if os.path.exists("todolist.txt"):
            f = open("todolist.txt","r")
            if VERBOSE: print ("Reading existing todo list...")
            for l in f.readlines():
                if l.strip() != "":
                    todolist.append(l.strip())
            f.close()
        else:
            # Fresh start: seed the todo list from the index page.
            indexpages,imgs = get(INDEX)
            todolist.extend(indexpages)
    # Crawl loop: pop a page, record it and its images, queue unseen links.
    while todolist:
        targetpage = todolist.pop()
        if (not targetpage in NORETRIEVE):
            if VERBOSE: print (count, ": Scanning ", targetpage)
            pages,images = get(targetpage)
            count += 1
            processed.append(targetpage)
            processed.extend(images)
            if VERBOSE: print ("got",len(pages),"links")
            for p in pages:
                if (not (p in todolist)) and (not (p in processed)):
                    todolist.append(p)
            if WRITETHROUGH:
                # Persist progress after every page, in case of failure.
                writeList(processed)
                writeList(todolist,"todolist.txt")
    if VERBOSE: print ("Fetched ", count, " pages")
    if not WRITETHROUGH:
        writeList(processed)
    if pagename:
        return processed
    return 0
|
||||
|
||||
def get(page):
    "downloads a single page, returns the other pages it links to"
    # Fetch and sanitize the raw html, then scan it for page and image links.
    markup = cleanhtml(fetchpage(page))
    return getlinks(markup), getimagelinks(markup)
|
||||
|
||||
def cleanhtml(html):
    "cleans given html code from dirty script stuff"
    # Fold linebreaks into a marker so the regexes can match across lines;
    # the marker is restored to real newlines at the end.
    html = html.replace('\n','Wlinebreak')
    # (pattern, replacement) pairs applied in this exact order.
    strippers = [
        ('(.*)<div[^>]+column-content+[^>]+>', ''),          # before content
        ('<div[^>]+column-one+[^>]+>.*', ''),                # after content
        ('<!--[^>]+-->', ''),                                # comment tags
        ('<script[^>]*>.*?</script>', ''),                   # script tags
        (r'<!--\[if[^>]*>.*?endif\]-->', ''),                # IE conditional tags
        ('<div id="jump-to-nav"[^>]*>.*?</div>', ''),        # nav div
        ('<h3 id="siteSub"[^>]*>.*?</h3>', ''),              # print subtitle
        ('Retrieved from', 'Online version:'),               # online title
        ('<div id="mw-normal-catlinks[^>]>.*?</div>', ''),   # catlinks
        ('<div class="NavHead.*?</div>', ''),                # nav stuff
        ('<div class="NavContent.*?</div>', ''),             # nav stuff
        ('<div class="NavEnd.*?</div>', ''),                 # nav stuff
        ('<div class="mw-pt-translate-header.*?</div>', ''), # translation header
    ]
    for pattern, replacement in strippers:
        html = re.compile(pattern).sub(replacement, html)
    if not GETTRANSLATIONS:
        # Also drop the language-switcher blocks.
        for pattern in ('<div class="languages.*?</div>',
                        '<div class="mw-pt-languages.*?</div>'):
            html = re.compile(pattern).sub('', html)
    # restore the original linebreaks
    html = re.compile('Wlinebreak').sub('\n', html)
    return html
|
||||
|
||||
def getlinks(html):
    "returns a list of wikipage links in html file"
    global NORETRIEVE  # pages discovered here may be appended to the skip list
    links = re.findall('<a[^>]*>.*?</a>',html)
    pages = []
    for l in links:
        # Try the different URL shapes used by the wiki, most specific first.
        # rg = re.findall('php\?title=(.*)\" title',l)
        rg = re.findall('href=.*?php\?title=(.*?)"',l)
        if not rg:
            rg = re.findall('href="\/wiki\/(.*?)"',l)
        if not rg:
            rg = re.findall('href=".*?wiki\\.freecadweb\\.org\/(.*?)"',l)
        if not rg:
            # last resort: any root-relative link, except site resources
            rg = re.findall('href="\/(.*?)"',l)
            if "images" in rg:
                rg = None
            elif "mediawiki" in rg:
                rg = None
        if rg:
            rg = rg[0]
            if not "Command_Reference" in rg:
                if "#" in rg:
                    # strip in-page anchors
                    rg = rg.split('#')[0]
                if ":" in rg:
                    # namespaced pages (File:, Category:, ...) stay online
                    NORETRIEVE.append(rg)
                if "&" in rg:
                    NORETRIEVE.append(rg)
                if ";" in rg:
                    NORETRIEVE.append(rg)
                if "/" in rg:
                    # subpages are translations; skip unless requested
                    if not GETTRANSLATIONS:
                        NORETRIEVE.append(rg)
                if not rg in NORETRIEVE:
                    pages.append(rg)
                    print ("got link: ",rg)
    return pages
|
||||
|
||||
def getimagelinks(html):
    "returns a list of image links found in an html file"
    # Collect every <img> src attribute and keep only local (non-http) ones.
    found = re.findall('<img.*?src="(.*?)"', html)
    return [src for src in found if not src.startswith("http")]
|
||||
|
||||
def fetchpage(page):
    "retrieves given page from the wiki"
    print ("fetching: ",page)
    failcount = 0
    # Retry on HTTP errors up to MAXFAIL times before giving up.
    while failcount < MAXFAIL:
        try:
            html = (urlopen(URL + wikiindex + page).read())
            return html
        except HTTPError:
            failcount += 1
    # All retries failed: report and stop the whole crawl.
    print ('Error: unable to fetch page ' + page)
    sys.exit()
|
||||
|
||||
def cleanList(pagelist):
    """Return pagelist without duplicates or 'redlink' (non-existing page) entries.

    Order of first occurrence is preserved.
    """
    # Use a set for O(1) membership tests instead of rescanning the result
    # list for every element (the original was O(n^2) on large crawls).
    npages = []
    seen = set()
    for p in pagelist:
        if p not in seen:
            seen.add(p)
            if "redlink" not in p:
                npages.append(p)
    return npages
|
||||
|
||||
def writeList(pages,filename="wikifiles.txt"):
    "writes the cleaned page list to a text file, one page name per line"
    # NOTE(review): opened in binary mode but written str values — fine on
    # Python 2 (this script imports urllib2), would break on Python 3.
    pages = cleanList(pages)
    f = open(filename,"wb")
    for p in pages:
        f.write(p+"\n")
    f.close()
    if VERBOSE: print ("written ",filename)
|
||||
|
||||
if __name__ == "__main__":
    # Crawl the pages given on the command line, or resume/start a full crawl.
    crawl(sys.argv[1:])
|
||||
|
||||
@@ -1,355 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
#***************************************************************************
|
||||
#* *
|
||||
#* Copyright (c) 2009 Yorik van Havre <yorik@uncreated.net> *
|
||||
#* *
|
||||
#* This program is free software; you can redistribute it and/or modify *
|
||||
#* it under the terms of the GNU Lesser General Public License (LGPL) *
|
||||
#* as published by the Free Software Foundation; either version 2 of *
|
||||
#* the License, or (at your option) any later version. *
|
||||
#* for detail see the LICENCE text file. *
|
||||
#* *
|
||||
#* This program is distributed in the hope that it will be useful, *
|
||||
#* but WITHOUT ANY WARRANTY; without even the implied warranty of *
|
||||
#* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
|
||||
#* GNU Library General Public License for more details. *
|
||||
#* *
|
||||
#* You should have received a copy of the GNU Library General Public *
|
||||
#* License along with this program; if not, write to the Free Software *
|
||||
#* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 *
|
||||
#* USA *
|
||||
#* *
|
||||
#***************************************************************************
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
__title__="downloadwiki"
|
||||
__author__ = "Yorik van Havre <yorik@uncreated.net>"
|
||||
__url__ = "http://www.freecadweb.org"
|
||||
|
||||
"""
|
||||
This script retrieves the contents of a wiki site from a pages list
|
||||
"""
|
||||
|
||||
import os, re
|
||||
from urllib2 import urlopen, HTTPError
|
||||
|
||||
# CONFIGURATION #################################################

DEFAULTURL = "https://www.freecadweb.org" #default URL if no URL is passed
INDEX = "Online_Help_Toc" # the start page from where to crawl the wiki
NORETRIEVE = ['Manual','Developer_hub','Power_users_hub','Users_hub','Source_documentation',
              'User_hub','Main_Page','About_this_site','Interesting_links','Syndication_feeds',
              'FreeCAD:General_disclaimer','FreeCAD:About','FreeCAD:Privacy_policy','WikiPages'] # pages that won't be fetched (kept online)
NORETRIEVE += ['Constraint_Concentric','Constraint_EqualLength','Constraint_ExternalAngle',
               'Constraint_Horizontal','Constraint_HorizontalDistance','Constraint_Internal_Alignment',
               'Constraint_InternalAngle','Constraint_Length','Constraint_Lock','Constraint_Parallel',
               'Constraint_Perpendicular','Constraint_PointOnEnd','Constraint_PointOnMidPoint',
               'Constraint_PointOnObject','Constraint_PointOnPoint','Constraint_PointOnStart',
               'Constraint_PointToObject','Constraint_Radius','Constraint_SnellsLaw',
               'Constraint_Symmetric','Constraint_Tangent','Constraint_TangentToEnd',
               'Constraint_TangentToStart','Constraint_Vertical'] # pages that have been renamed but still dangle around
MAXFAIL = 3 # max number of retries if download fails
VERBOSE = True # to display what's going on. Otherwise, runs totally silent.
GETTRANSLATIONS = False # Set true if you want to get the translations too.

# END CONFIGURATION ##############################################

FOLDER = "./localwiki"      # destination folder for the downloaded site
LISTFILE = "wikifiles.txt"  # list of pages to download (built by buildwikiindex.py)
URL = DEFAULTURL
wikiindex = "/wiki/index.php?title="  # URL prefix to fetch a wiki page by title
imageprefix = "/wiki/"                # URL prefix for wiki-hosted images
# minimal placeholder page saved as default.html alongside the mirror
defaultfile = "<html><head><link type='text/css' href='wiki.css' rel='stylesheet'></head><body> </body></html>"
|
||||
css = """/* Basic CSS for offline wiki rendering */
|
||||
|
||||
body {
|
||||
font-family: Fira Sans,Arial,Helvetica,sans-serif;
|
||||
text-align: justify;
|
||||
max-width: 800px;
|
||||
}
|
||||
|
||||
h1 {
|
||||
font-size: 2.4em;
|
||||
font-weight: bold;
|
||||
padding: 5px;
|
||||
border-radius: 5px;
|
||||
}
|
||||
|
||||
h2 {
|
||||
font-weight: normal;
|
||||
font-size: 1.6em;
|
||||
border-bottom: 1px solid #ddd;
|
||||
}
|
||||
|
||||
h3 {
|
||||
padding-left: 20px;
|
||||
}
|
||||
|
||||
img {
|
||||
max-width: 100%;
|
||||
}
|
||||
|
||||
li {
|
||||
margin-top: 10px;
|
||||
}
|
||||
|
||||
pre, .mw-code {
|
||||
text-align: left;
|
||||
padding: 5px 5px 5px 20px;
|
||||
font-family: mono;
|
||||
border-radius: 2px;
|
||||
}
|
||||
|
||||
a:link, a:visited {
|
||||
font-weight: bold;
|
||||
text-decoration: none;
|
||||
color: #2969C4;
|
||||
}
|
||||
|
||||
a:hover {
|
||||
text-decoration: underline;
|
||||
}
|
||||
|
||||
.printfooter {
|
||||
font-size: 0.8em;
|
||||
color: #333333;
|
||||
border-top: 1px solid #333;
|
||||
margin-top: 20px;
|
||||
}
|
||||
|
||||
.wikitable #toc {
|
||||
font-size: 0.8em;
|
||||
}
|
||||
|
||||
.ct, .ctTitle, .ctOdd, .ctEven th {
|
||||
font-size: 1em;
|
||||
text-align: left;
|
||||
width: 190px;
|
||||
float: right;
|
||||
margin-top: 10px;
|
||||
border-radius: 2px;
|
||||
}
|
||||
|
||||
.ct {
|
||||
margin-left: 15px;
|
||||
padding: 10px;
|
||||
}
|
||||
#mw-navigation, .mw-jump-link, .docnav, .NavFrame {
|
||||
display:none; /*TODO remove on next build (included below)*/
|
||||
}
|
||||
"""
|
||||
|
||||
def crawl():
    """Download the entire wiki site listed in LISTFILE into FOLDER.

    Writes the offline stylesheet and a default page, reads the page
    list, then fetches the INDEX start page followed by every listed
    page. Returns 0 on completion.
    """
    global processed
    processed = []  # filenames of images fetched so far (shared with fetchimage)
    if VERBOSE: print ("crawling ", URL, ", saving in ", FOLDER)
    if not os.path.isdir(FOLDER): os.mkdir(FOLDER)
    # write the offline stylesheet and an empty default page
    file = open(FOLDER + os.sep + "wiki.css",'wb')
    file.write(css)
    file.close()
    dfile = open(FOLDER + os.sep + "default.html",'wb')
    dfile.write(defaultfile)
    dfile.close()
    # read the list of pages to fetch (one page name per line)
    lfile = open(LISTFILE)
    global locallist
    locallist = []
    for l in lfile: locallist.append(l.replace("\n",""))
    lfile.close()
    todolist = locallist[:]
    print ("getting ",len(todolist)," files...")
    count = 1
    get(INDEX)  # always fetch the start page first
    while todolist:
        targetpage = todolist.pop()
        if VERBOSE: print (count,(3-len(str(count)))*" ", ": Fetching ", targetpage)
        get(targetpage)
        count += 1
    # NOTE(review): count starts at 1 and is incremented once per page, so
    # this total is one higher than the number of pages actually fetched.
    if VERBOSE: print ("Fetched ", count, " pages")
    if VERBOSE: print ("All done!")
    return 0
|
||||
|
||||
def get(page):
    """Download a single page or image unless it already exists locally.

    Image links (by file extension) are delegated to fetchimage(); wiki
    pages are fetched, cleaned and written to disk via output().
    """
    localpage = page
    if "Command_Reference" in localpage:
        # paginated Command_Reference category pages map to special local names
        localpage = localpage.replace("Category:","")
        localpage = localpage.replace("&pagefrom=","+")
        localpage = localpage.replace("#mw-pages","")
    if page[-4:] in [".png",".jpg",".svg",".gif","jpeg",".PNG",".JPG"]:
        # the link points at an image file, not a wiki page
        fetchimage(page)
    elif not exists(localpage):
        html = fetchpage(page)
        html = cleanhtml(html)
        pages = getlinks(html)
        html = cleanlinks(html,pages)
        html = cleanimagelinks(html)
        output(html,page)
    else:
        if VERBOSE: print (" skipping ",page)
|
||||
|
||||
def getlinks(html):
    """Return the wiki page names linked from *html*.

    Side effect: page names containing ":", ";", "&" or (unless
    GETTRANSLATIONS) "/" are appended to the global NORETRIEVE list so
    cleanlinks() keeps them pointing at the online wiki.

    NOTE(review): indentation reconstructed from a mangled source —
    pages.append() is assumed to run for every matched link; confirm
    against the archived original.
    """
    links = re.findall('<a[^>]*>.*?</a>',html)
    pages = []
    for l in links:
        # rg = re.findall('php\?title=(.*)\" title',l)
        rg = re.findall('href=.*?php\?title=(.*?)"',l)
        if not rg:
            rg = re.findall('href="\/wiki\/(.*?)"',l)
        if rg:
            rg = rg[0]
            if not "Command_Reference" in rg:
                if "#" in rg:
                    # drop the anchor part of the link
                    rg = rg.split('#')[0]
                if ":" in rg:
                    NORETRIEVE.append(rg)
                if ";" in rg:
                    NORETRIEVE.append(rg)
                if "&" in rg:
                    NORETRIEVE.append(rg)
                if "/" in rg:
                    # subpages are translations; skip unless requested
                    if not GETTRANSLATIONS:
                        NORETRIEVE.append(rg)
            pages.append(rg)
    return pages
|
||||
|
||||
def getimagelinks(html):
    """Collect the src attribute of every <img> tag found in *html*."""
    img_src = re.compile('<img.*?src="(.*?)"')
    return img_src.findall(html)
|
||||
|
||||
def cleanhtml(html):
    """Strip navigation, scripts and wiki chrome from raw page html.

    Linebreaks are temporarily replaced by the marker 'Wlinebreak' so the
    regexes can match across physical lines, and restored at the end.
    The substitutions are order-dependent: content is first isolated, then
    individual elements are removed.
    """
    html = html.replace('\n','Wlinebreak') # removing linebreaks for regex processing
    html = html.replace('\t','') # removing tab marks
    html = re.compile('(.*)<div id=\"content+[^>]+>').sub('',html) # stripping before content
    html = re.compile('<div id="mw-head+[^>]+>.*').sub('',html) # stripping after content
    html = re.compile('<!--[^>]+-->').sub('',html) # removing comment tags
    html = re.compile('<script[^>]*>.*?</script>').sub('',html) # removing script tags
    html = re.compile('<!--\[if[^>]*>.*?endif\]-->').sub('',html) # removing IE tags
    html = re.compile('<div id="jump-to-nav"[^>]*>.*?</div>').sub('',html) # removing nav div
    html = re.compile('<h3 id="siteSub"[^>]*>.*?</h3>').sub('',html) # removing print subtitle
    html = re.compile('Retrieved from').sub('Online version:',html) # changing online title
    html = re.compile('<div id="mw-normal-catlinks.*?</div>').sub('',html) # removing catlinks
    html = re.compile('<div class="NavHead.*?</div>').sub('',html) # removing nav stuff
    html = re.compile('<div class="NavContent.*?</div>').sub('',html) # removing nav stuff
    html = re.compile('<div class="NavEnd.*?</div>').sub('',html) # removing nav stuff
    html = re.compile('<div id="mw-navigation.*?</div>').sub('',html) # removing nav stuff
    html = re.compile('<table id="toc.*?</table>').sub('',html) # removing toc
    html = re.compile('width=\"100%\" style=\"float: right; width: 230px; margin-left: 1em\"').sub('',html) # removing command box styling
    #html = re.compile('<div class="docnav.*?</div>Wlinebreak</div>').sub('',html) # removing docnav
    html = re.compile('<div class="mw-pt-translate-header.*?</div>').sub('',html) # removing translations links
    if not GETTRANSLATIONS:
        html = re.compile('<div class="languages.*?</div>').sub('',html) # removing translations links
        html = re.compile('<div class="mw-pt-languages.*?</div>').sub('',html) # removing translations links
    html = re.compile('Wlinebreak').sub('\n',html) # restoring original linebreaks
    return html
|
||||
|
||||
|
||||
def cleanlinks(html, pages=None):
    """Rewrite wiki page links in *html* to local .html files.

    Links to pages listed in NORETRIEVE are redirected back to the online
    wiki; all others are rewritten to flat local filenames (subpage "/"
    becomes "-"). If *pages* is not given, links are extracted with
    getlinks().
    """
    if not pages: pages = getlinks(html)
    for page in pages:
        if page in NORETRIEVE:
            # keep this link pointing at the online wiki
            output = 'href="' + URL + wikiindex + page + '"'
        else:
            # rewrite to a flat local filename
            output = 'href="' + page.replace("/","-") + '.html"'
        html = re.compile('href="[^"]+' + page + '"').sub(output,html)
        if "Command_Reference" in output:
            # NOTE(review): special-casing of paginated category links;
            # indentation reconstructed from a mangled source — assumed to
            # run per-page inside the loop. Confirm against the archive.
            html = html.replace("Category:","")
            html = html.replace("&pagefrom=","+")
            html = html.replace("#mw-pages",".html")
            html = html.replace("/wiki/index.php?title=Command_Reference","Command_Reference")
    return html
|
||||
|
||||
def cleanimagelinks(html, links=None):
    """Rewrite image links in *html* to bare filenames (directory stripped).

    If *links* is falsy, they are extracted from the html with
    getimagelinks().
    """
    if not links:
        links = getimagelinks(html)
    for link in links or []:
        basename = re.findall('.*/(.*)', link)
        if basename:
            html = html.replace(link, basename[0])
        # fetchimage(link)  -- historical: images were fetched here once
    return html
|
||||
|
||||
def fetchpage(page):
    """Fetch the raw html of *page* from the wiki.

    Retries up to MAXFAIL times on HTTPError; other exceptions propagate.
    NOTE(review): unlike the buildwikiindex variant, this falls through
    after MAXFAIL failures and implicitly returns None — callers receive
    None instead of html in that case.
    """
    print (" downloading: ",URL + wikiindex + page)
    failcount = 0
    while failcount < MAXFAIL:
        try:
            html = (urlopen(URL + wikiindex + page).read())
            return html
        except HTTPError:
            failcount += 1
    print ('Error: unable to fetch page ' + page)
|
||||
|
||||
def fetchimage(imagelink):
    """Fetch *imagelink* from the wiki and save it under FOLDER.

    "File:" description-page links are skipped. The download is retried
    up to MAXFAIL times on any exception; on success the filename is
    recorded in the global `processed` list. Existing local files are
    not re-downloaded.
    """
    if imagelink[0:5] == "File:":
        # link to a file description page, not to the image itself
        print ("Skipping file page link")
        return
    filename = re.findall('.*/(.*)',imagelink)[0]  # basename after the last slash
    if not exists(filename,image=True):
        failcount = 0
        while failcount < MAXFAIL:
            try:
                if VERBOSE: print (" downloading " + URL + imageprefix + imagelink)
                data = (urlopen(URL + imageprefix + imagelink)).read()
                path = local(filename,image=True)
                file = open(path,'wb')
                file.write(data)
                file.close()
            except Exception:
                failcount += 1
            else:
                # success: record and stop retrying
                processed.append(filename)
                if VERBOSE: print (" saving ",local(filename,image=True))
                return
        print ('Error: unable to fetch file ' + filename)
    else:
        if VERBOSE: print (" skipping ",filename)
|
||||
|
||||
def local(page, image=False):
    """Map a wiki page or image name to its local path under FOLDER.

    Pages get a '.html' extension; images keep their own filename.
    """
    suffix = "" if image else ".html"
    return FOLDER + os.sep + page + suffix
|
||||
|
||||
def exists(page, image=False):
    """Return True if the local copy of *page* (or image) is on disk.

    Subpage slashes are flattened to dashes, matching how output()
    names the saved files.
    """
    return os.path.exists(local(page.replace("/", "-"), image))
|
||||
|
||||
def webroot(url):
    """Return the scheme+host part of *url* (everything before the first
    path slash).

    Fix: the pattern only matched "http://", but DEFAULTURL uses https,
    so the lookup raised IndexError on the default URL; accept both
    schemes. Still raises IndexError when *url* has no scheme or no
    path slash, as before.
    """
    return re.findall('(https?://.*?)/', url)[0]
|
||||
|
||||
def output(html,page):
    """Wrap raw page html in a complete document and save it under FOLDER.

    The page name (underscores replaced by spaces) becomes both the
    <title> and the top-level heading. Subpage slashes are flattened to
    dashes in the local filename.
    """
    title = page.replace("_"," ")
    header = "<html><head>"
    header += "<title>" + title + "</title>"
    header += '<meta http-equiv="Content-Type" content="text/html; charset=utf-8">'
    header += "<link type='text/css' href='wiki.css' rel='stylesheet'>"
    header += "</head><body>"
    header += "<h1>" + title + "</h1>"
    footer = "</body></html>"
    html = header+html+footer
    filename = local(page.replace("/","-"))
    if "Command_Reference" in filename:
        # normalize paginated category page filenames
        filename = filename.replace("Category:","")
        filename = filename.replace("&pagefrom=","+")
        filename = filename.replace("#mw-pages","")
        filename = filename.replace(".html.html",".html")
    print (" saving ",filename)
    file = open(filename,'wb')
    file.write(html)
    file.close()
|
||||
|
||||
if __name__ == "__main__":
    # Entry point: download every page listed in LISTFILE into FOLDER.
    crawl()
|
||||
|
||||
@@ -1,45 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
#***************************************************************************
|
||||
#* *
|
||||
#* Copyright (c) 2021 Yorik van Havre <yorik@uncreated.net> *
|
||||
#* *
|
||||
#* This program is free software; you can redistribute it and/or modify *
|
||||
#* it under the terms of the GNU Lesser General Public License (LGPL) *
|
||||
#* as published by the Free Software Foundation; either version 2 of *
|
||||
#* the License, or (at your option) any later version. *
|
||||
#* for detail see the LICENCE text file. *
|
||||
#* *
|
||||
#* This program is distributed in the hope that it will be useful, *
|
||||
#* but WITHOUT ANY WARRANTY; without even the implied warranty of *
|
||||
#* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
|
||||
#* GNU Library General Public License for more details. *
|
||||
#* *
|
||||
#* You should have received a copy of the GNU Library General Public *
|
||||
#* License along with this program; if not, write to the Free Software *
|
||||
#* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 *
|
||||
#* USA *
|
||||
#* *
|
||||
#***************************************************************************
|
||||
|
||||
"""This script fixes links like href="/Arch_Wall" into href="/Arch_Wall.html" where needed. Dirty hack, downloadwiki.py should be fixed instead"""
|
||||
|
||||
import os,re

# Post-processing pass over every downloaded page: rewrite extensionless
# absolute links (e.g. href="/Arch_Wall") into local relative .html links
# (href="Arch_Wall.html"). Links containing "." or "#" are left alone.
files = [f for f in os.listdir("localwiki") if f.endswith(".html")]
for fn in files:
    f = open(os.path.join("localwiki",fn))
    b = f.read()
    f.close()
    # collapse linebreaks so each href attribute sits on one logical line
    b = b.replace("\n","--endl--")
    for href in re.findall("href=\".*?\"",b):
        if (not "." in href) and (not "#" in href):
            # append .html before the closing quote
            repl = href[:-1]+".html\""
            if "href=\"/" in repl:
                # make the absolute link relative
                repl = repl.replace("href=\"/","href=\"")
            print(fn," : replacing",href,"with",repl)
            b = b.replace(href,repl)
    b = b.replace("--endl--","\n")
    f = open(os.path.join("localwiki",fn),"w")
    f.write(b)
    f.close()
|
||||
|
||||
@@ -1,199 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
#***************************************************************************
|
||||
#* *
|
||||
#* Copyright (c) 2017 Yorik van Havre <yorik@uncreated.net> *
|
||||
#* *
|
||||
#* This program is free software; you can redistribute it and/or modify *
|
||||
#* it under the terms of the GNU Lesser General Public License (LGPL) *
|
||||
#* as published by the Free Software Foundation; either version 2 of *
|
||||
#* the License, or (at your option) any later version. *
|
||||
#* for detail see the LICENCE text file. *
|
||||
#* *
|
||||
#* This program is distributed in the hope that it will be useful, *
|
||||
#* but WITHOUT ANY WARRANTY; without even the implied warranty of *
|
||||
#* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
|
||||
#* GNU Library General Public License for more details. *
|
||||
#* *
|
||||
#* You should have received a copy of the GNU Library General Public *
|
||||
#* License along with this program; if not, write to the Free Software *
|
||||
#* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 *
|
||||
#* USA *
|
||||
#* *
|
||||
#***************************************************************************
|
||||
|
||||
__title__="update.py"
|
||||
__author__ = "Yorik van Havre <yorik@uncreated.net>"
|
||||
__url__ = "http://www.freecadweb.org"
|
||||
|
||||
"""
|
||||
This script needs to be run after the wiki has been fully downloaded. It has three usages:
|
||||
|
||||
1) If no revisions.txt file is found, it parses the contents of the wikifiles.txt file
|
||||
and, for each entry, it retrieves a corresponding revision ID, and creates a revisions.txt file
|
||||
|
||||
2) If a revisions.txt file exists but no update.txt file exists, it crawls through all entries of
|
||||
wikifiles.txt, and for each one, compares the current revision with the one stored in revisions.txt.
|
||||
An update.txt file is created with all pages that have different revision IDs
|
||||
|
||||
3) If update.txt exists, each entry of it will be scanned again for new links and all the needed
|
||||
files downloaded. revisions.txt and wikifiles.txt are also updated.
|
||||
"""
|
||||
|
||||
import sys, os, re
|
||||
from urllib.request import urlopen
|
||||
from urllib.error import HTTPError
|
||||
|
||||
# CONFIGURATION #################################################
|
||||
|
||||
# CONFIGURATION #################################################

URL = "https://www.freecadweb.org/wiki" #default URL if no URL is passed
GETTRANSLATIONS = False # Set true if you want to get the translations too.
MAXFAIL = 3 # max number of retries if download fails
VERBOSE = True # to display what's going on. Otherwise, runs totally silent.

# END CONFIGURATION ##############################################

# URL query prefix used to fetch a wiki page by title
wikiindex = "/index.php?title="
|
||||
|
||||
def update(pagename=None):
    """Three-stage updater for a downloaded copy of the wiki.

    Dispatches on which state files exist in the current directory:

    1) no revisions.txt: read wikifiles.txt and store the current
       revision ID of every page in a new revisions.txt
    2) revisions.txt but no updates.txt: compare each stored revision
       with the live one and write the changed pages to updates.txt
    3) both revisions.txt and updates.txt: re-scan and re-download the
       changed pages' dependencies, refresh wikifiles.txt and
       revisions.txt, then delete updates.txt

    *pagename* is accepted for command-line compatibility but unused.

    Fix: when parsing revisions.txt the revision ID is the *last*
    colon-separated field (r[-1]); the old code read r[1], which is wrong
    for page names that themselves contain a colon (e.g. "FreeCAD:About")
    and inconsistent with the page name being ':'.join(r[:-1]).
    """
    if not os.path.exists("revisions.txt"): # case 1)
        if not os.path.exists("wikifiles.txt"):
            print("No wikifiles.txt found. Aborting")
            sys.exit()
        pages = []
        f = open("wikifiles.txt","r")
        if VERBOSE: print("Reading existing list...")
        for l in f.readlines():
            if l.strip() != "":
                if not "/wiki/" in l:  # only real pages carry revision IDs
                    if VERBOSE: print("Adding ",l.strip())
                    pages.append(l.strip())
        f.close()
        if VERBOSE: print("Added ",str(len(pages))," entries")
        i = 1
        revs = []
        for page in pages:
            rev = getRevision(page)
            if VERBOSE: print(str(i)," revision: ",rev)
            revs.append(page+":"+rev)
            i += 1
        writeList(revs,"revisions.txt")
        print("All done. Successfully written revisions.txt with ",len(revs)," entries.")

    elif os.path.exists("revisions.txt") and (not os.path.exists("updates.txt")): # case 2)
        f = open("revisions.txt","r")
        if VERBOSE: print("Reading revisions list...")
        revisions = {}
        for l in f.readlines():
            if l.strip() != "":
                r = l.strip().split(":")
                p = ":".join(r[:-1])
                if VERBOSE: print("Adding ",p)
                revisions[p] = r[-1]  # BUGFIX: was r[1]
        f.close()
        if VERBOSE: print("Added ",str(len(list(revisions.keys())))," entries")
        updates = []
        i = 1
        for page in list(revisions.keys()):
            rev = getRevision(page)
            if rev != revisions[page]:
                if VERBOSE: print(str(i),page," has a new revision: ",rev)
                updates.append(page)
            else:
                if VERBOSE: print(str(i),page," is up to date ")
            i += 1
        if updates:
            writeList(updates,"updates.txt")
            print("All done. Successfully written updates.txt with ",len(updates)," entries.")
        else:
            print("Everything up to date. Nothing to be done.")

    elif os.path.exists("revisions.txt") and os.path.exists("updates.txt"): # case 3)
        if not os.path.exists("wikifiles.txt"):
            print("No wikifiles.txt found. Aborting")
            sys.exit()
        wikifiles = []
        f = open("wikifiles.txt","r")
        if VERBOSE: print("Reading wikifiles list...")
        for l in f.readlines():
            if l.strip() != "":
                wikifiles.append(l.strip())
        f.close()
        if VERBOSE: print("Read ",str(len(wikifiles))," entries")
        f = open("revisions.txt","r")
        if VERBOSE: print("Reading revisions list...")
        revisions = {}
        for l in f.readlines():
            if l.strip() != "":
                r = l.strip().split(":")
                p = ":".join(r[:-1])
                revisions[p] = r[-1]  # BUGFIX: was r[1], see docstring
        f.close()
        todo = []
        f = open("updates.txt","r")
        if VERBOSE: print("Reading updates list...")
        for l in f.readlines():
            if l.strip() != "":
                todo.append(l.strip())
        f.close()
        if VERBOSE: print(str(len(todo))," pages to scan...")
        # re-scan the changed pages for links with the crawler module
        import buildwikiindex
        buildwikiindex.WRITETHROUGH = False
        buildwikiindex.VERBOSE = VERBOSE
        updates = []
        for t in todo:
            if VERBOSE: print("Scanning ",t)
            updates.extend(buildwikiindex.crawl(t))
        # keep only files not already downloaded (set lookup instead of
        # the old O(n^2) list scan; same result)
        # NOTE(review): this also filters out the changed pages themselves
        # when they are already listed in wikifiles.txt — confirm that
        # buildwikiindex.crawl re-downloads them, otherwise they are never
        # refreshed here.
        known = set(wikifiles)
        updates = [u for u in updates if u not in known]
        if VERBOSE: print(str(len(updates))," files to download...")
        import downloadwiki
        i = 1
        for u in updates:
            if VERBOSE: print(i, ": Fetching ", u)
            downloadwiki.get(u)
            if not "/wiki/" in u:
                rev = getRevision(u)
                revisions[u] = rev
                if not u in wikifiles:
                    wikifiles.append(u)
            i += 1
        if VERBOSE: print("Updating wikifiles and revisions...")
        writeList(wikifiles,"wikifiles.txt")
        updatedrevs = []
        for k in list(revisions.keys()):
            updatedrevs.append(k+":"+revisions[k])
        writeList(updatedrevs,"revisions.txt")
        os.remove("updates.txt")
        if VERBOSE: print("All done!")
|
||||
|
||||
def getRevision(page):
    """Return the current wiki revision ID of *page* as a string.

    Exits the program if exactly one "wgCurRevisionId" marker cannot be
    found in the fetched page.

    Fix: under Python 3 (this module uses urllib.request), fetchPage()
    returns bytes; running a str regex against bytes raises TypeError.
    Decode to text before matching.
    """
    html = fetchPage(page)
    if isinstance(html, bytes):
        html = html.decode("utf-8", errors="replace")
    revs = re.findall("wgCurRevisionId\"\:(.*?),",html)
    if len(revs) == 1:
        return revs[0]
    print('Error: unable to get revision ID of ' + page)
    sys.exit()
|
||||
|
||||
def fetchPage(page):
    """Fetch the raw content (bytes) of *page* from the wiki.

    Retries up to MAXFAIL times on HTTPError; other exceptions propagate.
    Exits the program if every attempt fails.
    """
    print("fetching: ",page)
    failcount = 0
    while failcount < MAXFAIL:
        try:
            html = (urlopen(URL + wikiindex + page).read())
            return html
        except HTTPError:
            failcount += 1
    print('Error: unable to fetch page ' + page)
    sys.exit()
|
||||
|
||||
def writeList(pages, filename):
    """Write *pages* to *filename*, one entry per line.

    Fix: the file was opened in binary mode ("wb") while str objects
    were written, which raises TypeError under Python 3 (this module
    uses urllib.request, i.e. Python 3). Open in text mode, with a
    context manager so the handle is closed even on error.
    """
    with open(filename, "w") as f:
        for p in pages:
            f.write(p + "\n")
    if VERBOSE: print("written ", filename)
|
||||
|
||||
if __name__ == "__main__":
    # Entry point; command-line arguments are passed through but currently
    # unused by update(), which dispatches on which state files exist.
    update(sys.argv[1:])
|
||||
Reference in New Issue
Block a user