diff --git a/.gitignore b/.gitignore
index dcaeb2b20a..4b780f376e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -28,8 +28,7 @@ install_manifest.txt
/ZERO_CHECK.dir/
/build/
/src/Tools/offlinedoc/localwiki/
-/src/Tools/offlinedoc/todolist.txt
-/src/Tools/offlinedoc/wikifiles.txt
+/src/Tools/offlinedoc/*.txt
OpenSCAD_rc.py
.subuser-dev
/\.idea/
diff --git a/src/Tools/offlinedoc/README b/src/Tools/offlinedoc/README
index addc16eae7..8ffe61dfdb 100644
--- a/src/Tools/offlinedoc/README
+++ b/src/Tools/offlinedoc/README
@@ -16,4 +16,13 @@ download and another to actually download the files.
4) run "buildpdf.py" to generate freecad.pdf (wkhtmltopdf must be installed)
-5) the qhelp files can be tested with "assistant -collectionFile freecad.qhc"
\ No newline at end of file
+5) the qhelp files can be tested with "assistant -collectionFile freecad.qhc"
+
+6) If you have already downloaded the whole wiki, run "update.py" immediately
+ after, to create a list of revision IDs for each page.
+
+7) Once the initial revisions list has been created, the "update.py" script
+   can be run anytime in the future, to check for pages that have changed
+   since the stored revision ID. The script is meant to run twice: once to get
+   a list of pages that have changed, and another one to download the changed
+   pages (and all their dependencies) again.
diff --git a/src/Tools/offlinedoc/buildwikiindex.py b/src/Tools/offlinedoc/buildwikiindex.py
index f448abcddc..933b63d701 100755
--- a/src/Tools/offlinedoc/buildwikiindex.py
+++ b/src/Tools/offlinedoc/buildwikiindex.py
@@ -36,9 +36,9 @@ from urllib2 import urlopen, HTTPError
# CONFIGURATION #################################################
-URL = "http://www.freecadweb.org/wiki" #default URL if no URL is passed
+URL = "https://www.freecadweb.org/wiki" #default URL if no URL is passed
INDEX = "Online_Help_Toc" # the start page from where to crawl the wiki
-NORETRIEVE = ['Manual','Developer_hub','Power_users_hub','Users_hub','Source_documentation', 'User_hub','Main_Page','About_this_site','Interesting_links','Syndication_feeds','FreeCAD:General_disclaimer','FreeCAD:About','FreeCAD:Privacy_policy','Introduction_to_python'] # pages that won't be fetched (kept online)
+NORETRIEVE = ['Manual','Developer_hub','Power_users_hub','Users_hub','Source_documentation', 'User_hub','Main_Page','About_this_site','Interesting_links','Syndication_feeds','FreeCAD:General_disclaimer','FreeCAD:About','FreeCAD:Privacy_policy','WikiPages'] # pages that won't be fetched (kept online)
GETTRANSLATIONS = False # Set true if you want to get the translations too.
MAXFAIL = 3 # max number of retries if download fails
VERBOSE = True # to display what's going on. Otherwise, runs totally silent.
@@ -48,35 +48,37 @@ WRITETHROUGH = True # if true, fetched files are constantly written to disk, in
wikiindex = "/index.php?title="
-def crawl(pagename):
+def crawl(pagename=[]):
"downloads an entire wiki site"
todolist = []
processed = []
count = 1
- if os.path.exists("wikifiles.txt"):
- f = open("wikifiles.txt","r")
- if VERBOSE: print "Reading existing list..."
- for l in f.readlines():
- if l.strip() != "":
- if VERBOSE: print "Adding ",l
- processed.append(l.strip())
- f.close()
- if os.path.exists("todolist.txt"):
- f = open("todolist.txt","r")
- if VERBOSE: print "Reading existing todo list..."
- for l in f.readlines():
- if l.strip() != "":
- todolist.append(l.strip())
- f.close()
+ if pagename:
+ if not isinstance(pagename,list):
+ pagename = [pagename]
+ todolist = pagename
else:
- if pagename:
- todolist = pagename
+ if os.path.exists("wikifiles.txt"):
+ f = open("wikifiles.txt","r")
+ if VERBOSE: print "Reading existing list..."
+ for l in f.readlines():
+ if l.strip() != "":
+ if VERBOSE: print "Adding ",l
+ processed.append(l.strip())
+ f.close()
+ if os.path.exists("todolist.txt"):
+ f = open("todolist.txt","r")
+ if VERBOSE: print "Reading existing todo list..."
+ for l in f.readlines():
+ if l.strip() != "":
+ todolist.append(l.strip())
+ f.close()
else:
indexpages,imgs = get(INDEX)
todolist.extend(indexpages)
while todolist:
targetpage = todolist.pop()
- if not targetpage in NORETRIEVE:
+ if (not targetpage in NORETRIEVE):
if VERBOSE: print count, ": Scanning ", targetpage
pages,images = get(targetpage)
count += 1
@@ -92,6 +94,8 @@ def crawl(pagename):
if VERBOSE: print "Fetched ", count, " pages"
if not WRITETHROUGH:
writeList(processed)
+ if pagename:
+ return processed
return 0
def get(page):
@@ -126,11 +130,16 @@ def cleanhtml(html):
def getlinks(html):
"returns a list of wikipage links in html file"
+ global NORETRIEVE
links = re.findall(']*>.*?',html)
pages = []
for l in links:
# rg = re.findall('php\?title=(.*)\" title',l)
rg = re.findall('href=.*?php\?title=(.*?)"',l)
+ if not rg:
+ rg = re.findall('href="\/wiki\/(.*?)"',l)
+ if "images" in rg:
+ rg = None
if rg:
rg = rg[0]
if not "Command_Reference" in rg:
@@ -138,21 +147,23 @@ def getlinks(html):
rg = rg.split('#')[0]
if ":" in rg:
NORETRIEVE.append(rg)
- if ";" in rg:
- NORETRIEVE.append(rg)
if "&" in rg:
NORETRIEVE.append(rg)
+ if ";" in rg:
+ NORETRIEVE.append(rg)
if "/" in rg:
if not GETTRANSLATIONS:
NORETRIEVE.append(rg)
- pages.append(rg)
if not rg in NORETRIEVE:
+ pages.append(rg)
print "got link: ",rg
return pages
def getimagelinks(html):
"returns a list of image links found in an html file"
- return re.findall(' *
+#* *
+#* This program is free software; you can redistribute it and/or modify *
+#* it under the terms of the GNU Lesser General Public License (LGPL) *
+#* as published by the Free Software Foundation; either version 2 of *
+#* the License, or (at your option) any later version. *
+#* for detail see the LICENCE text file. *
+#* *
+#* This program is distributed in the hope that it will be useful, *
+#* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+#* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
+#* GNU Library General Public License for more details. *
+#* *
+#* You should have received a copy of the GNU Library General Public *
+#* License along with this program; if not, write to the Free Software *
+#* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 *
+#* USA *
+#* *
+#***************************************************************************
+
+__title__="update.py"
+__author__ = "Yorik van Havre "
+__url__ = "http://www.freecadweb.org"
+
+"""
+This script needs to be run after the wiki has been fully downloaded. It has three usages:
+
+1) If no revisions.txt file is found, it parses the contents of the wikifiles.txt file
+ and, for each entry, it retrieves a corresponding revision ID, and creates a revisions.txt file
+
+2) If a revisions.txt file exists but no updates.txt file exists, it crawls through all entries of
+   wikifiles.txt, and for each one, compares the current revision with the one stored in revisions.txt.
+   An updates.txt file is created with all pages that have different revision IDs
+
+3) If updates.txt exists, each entry of it will be scanned again for new links and all the needed
+   files downloaded. revisions.txt and wikifiles.txt also get updated.
+"""
+
+import sys, os, re, tempfile, getopt
+from urllib2 import urlopen, HTTPError
+
+# CONFIGURATION #################################################
+
+URL = "https://www.freecadweb.org/wiki" #default URL if no URL is passed
+GETTRANSLATIONS = False # Set true if you want to get the translations too.
+MAXFAIL = 3 # max number of retries if download fails
+VERBOSE = True # to display what's going on. Otherwise, runs totally silent.
+
+# END CONFIGURATION ##############################################
+
+wikiindex = "/index.php?title="
+
+def update(pagename=None):
+    "3-stage wiki updater: build revisions.txt, then detect changes into updates.txt, then re-download"
+    if not os.path.exists("revisions.txt"): # case 1)
+        if not os.path.exists("wikifiles.txt"):
+            print "No wikifiles.txt found. Aborting"
+            sys.exit()
+        pages = []
+        f = open("wikifiles.txt","r")
+        if VERBOSE: print "Reading existing list..."
+        for l in f.readlines():
+            if l.strip() != "":
+                if not "/wiki/" in l: # skip image/resource entries; only real pages have revisions
+                    if VERBOSE: print "Adding ",l.strip()
+                    pages.append(l.strip())
+        f.close()
+        if VERBOSE: print "Added ",str(len(pages))," entries"
+        i = 1
+        revs = []
+        for page in pages:
+            rev = getRevision(page)
+            if VERBOSE: print str(i)," revision: ",rev
+            revs.append(page+":"+rev) # stored as "page:revision"; revision is always the LAST field
+            i += 1
+        writeList(revs,"revisions.txt")
+        print "All done. Successfully written revisions.txt with ",len(revs)," entries."
+
+    elif os.path.exists("revisions.txt") and (not os.path.exists("updates.txt")): # case 2)
+        f = open("revisions.txt","r")
+        if VERBOSE: print "Reading revisions list..."
+        revisions = {}
+        for l in f.readlines():
+            if l.strip() != "":
+                r = l.strip().split(":")
+                p = ":".join(r[:-1])
+                if VERBOSE: print "Adding ",p
+                revisions[p] = r[-1] # BUGFIX: was r[1] — wrong when the page name contains ":" (e.g. "FreeCAD:About")
+        f.close()
+        if VERBOSE: print "Added ",str(len(revisions.keys()))," entries"
+        updates = []
+        i = 1
+        for page in revisions.keys():
+            rev = getRevision(page)
+            if rev != revisions[page]:
+                if VERBOSE: print str(i),page," has a new revision: ",rev
+                updates.append(page)
+            else:
+                if VERBOSE: print str(i),page," is up to date "
+            i += 1
+        if updates:
+            writeList(updates,"updates.txt")
+            print "All done. Successfully written updates.txt with ",len(updates)," entries."
+        else:
+            print "Everything up to date. Nothing to be done."
+
+    elif os.path.exists("revisions.txt") and os.path.exists("updates.txt"): # case 3)
+        if not os.path.exists("wikifiles.txt"):
+            print "No wikifiles.txt found. Aborting"
+            sys.exit()
+        wikifiles = []
+        f = open("wikifiles.txt","r")
+        if VERBOSE: print "Reading wikifiles list..."
+        for l in f.readlines():
+            if l.strip() != "":
+                wikifiles.append(l.strip())
+        f.close()
+        if VERBOSE: print "Read ",str(len(wikifiles))," entries"
+        f = open("revisions.txt","r")
+        if VERBOSE: print "Reading revisions list..."
+        revisions = {}
+        for l in f.readlines():
+            if l.strip() != "":
+                r = l.strip().split(":")
+                p = ":".join(r[:-1])
+                revisions[p] = r[-1] # BUGFIX: was r[1] — same ":"-in-page-name issue as case 2
+        f.close()
+        todo = []
+        f = open("updates.txt","r")
+        if VERBOSE: print "Reading updates list..."
+        for l in f.readlines():
+            if l.strip() != "":
+                todo.append(l.strip())
+        f.close()
+        if VERBOSE: print str(len(todo))," pages to scan..."
+        import buildwikiindex
+        buildwikiindex.WRITETHROUGH = False # collect results in memory instead of rewriting wikifiles.txt
+        buildwikiindex.VERBOSE = VERBOSE
+        updates = []
+        for t in todo:
+            if VERBOSE: print "Scanning ",t
+            updates.extend(buildwikiindex.crawl(t))
+        updates = [u for u in updates if not u in wikifiles] # NOTE(review): this also drops the changed pages themselves (already in wikifiles) — confirm they get re-fetched
+        if VERBOSE: print str(len(updates))," files to download..."
+        import downloadwiki
+        i = 1
+        for u in updates:
+            if VERBOSE: print i, ": Fetching ", u
+            downloadwiki.get(u)
+            if not "/wiki/" in u: # only real pages carry a revision ID
+                rev = getRevision(u)
+                revisions[u] = rev
+            if not u in wikifiles:
+                wikifiles.append(u)
+            i += 1
+        if VERBOSE: print "Updating wikifiles and revisions..."
+        writeList(wikifiles,"wikifiles.txt")
+        updatedrevs = []
+        for k in revisions.keys():
+            updatedrevs.append(k+":"+revisions[k])
+        writeList(updatedrevs,"revisions.txt")
+        os.remove("updates.txt") # consumed: next run starts again at case 2
+        if VERBOSE: print "All done!"
+
+def getRevision(page): # returns the current revision ID (as a string) of the given wiki page
+    html = fetchPage(page)
+    revs = re.findall("wgCurRevisionId\"\:(.*?),",html) # MediaWiki embeds the revision in its JS config vars
+    if len(revs) == 1:
+        return revs[0]
+    print 'Error: unable to get revision ID of ' + page
+    sys.exit() # zero or multiple matches: page layout changed or fetch failed — abort the whole run
+
+def fetchPage(page):
+    "retrieves given page from the wiki"
+    print "fetching: ",page
+    failcount = 0
+    while failcount < MAXFAIL: # retry transient HTTP errors up to MAXFAIL times
+        try:
+            html = (urlopen(URL + wikiindex + page).read())
+            return html
+        except HTTPError:
+            failcount += 1
+    print 'Error: unable to fetch page ' + page
+    sys.exit() # all retries exhausted — abort so stale data is never written
+
+def writeList(pages,filename): # writes the given list of strings to filename, one entry per line
+    f = open(filename,"wb")
+    for p in pages:
+        f.write(p+"\n")
+    f.close()
+    if VERBOSE: print "written ",filename
+
+if __name__ == "__main__":
+ update(sys.argv[1:])