From 8aad83f08f7a9e4da9c42b42d14f4ff514ee93c6 Mon Sep 17 00:00:00 2001 From: Yorik van Havre Date: Mon, 26 Feb 2018 15:12:53 -0300 Subject: [PATCH] Updated wiki download scripts --- .gitignore | 3 +- src/Tools/offlinedoc/README | 11 +- src/Tools/offlinedoc/buildwikiindex.py | 62 ++++---- src/Tools/offlinedoc/downloadwiki.py | 4 +- src/Tools/offlinedoc/update.py | 198 +++++++++++++++++++++++++ 5 files changed, 249 insertions(+), 29 deletions(-) create mode 100755 src/Tools/offlinedoc/update.py diff --git a/.gitignore b/.gitignore index dcaeb2b20a..4b780f376e 100644 --- a/.gitignore +++ b/.gitignore @@ -28,8 +28,7 @@ install_manifest.txt /ZERO_CHECK.dir/ /build/ /src/Tools/offlinedoc/localwiki/ -/src/Tools/offlinedoc/todolist.txt -/src/Tools/offlinedoc/wikifiles.txt +/src/Tools/offlinedoc/*.txt OpenSCAD_rc.py .subuser-dev /\.idea/ diff --git a/src/Tools/offlinedoc/README b/src/Tools/offlinedoc/README index addc16eae7..8ffe61dfdb 100644 --- a/src/Tools/offlinedoc/README +++ b/src/Tools/offlinedoc/README @@ -16,4 +16,13 @@ download and another to actually download the files. 4) run "buildpdf.py" to generate freecad.pdf (wkhtmltopdf must be installed) -5) the qhelp files can be tested with "assistant -collectionFile freecad.qhc" \ No newline at end of file +5) the qhelp files can be tested with "assistant -collectionFile freecad.qhc" + +6) If you have already downloaded the whole wiki, run "update.py" immediately + after, to create a list of revision IDs for each page. + +7) Once the initial revisions list has been created, the "update.py" script + can be run anytime in the future, to check for pages that have changed + since the stored revision ID. The script is meant to run twice, once to get + a list of pages that have changed, and a second time to download the changed + pages (and all their dependencies) again. 
diff --git a/src/Tools/offlinedoc/buildwikiindex.py b/src/Tools/offlinedoc/buildwikiindex.py index f448abcddc..933b63d701 100755 --- a/src/Tools/offlinedoc/buildwikiindex.py +++ b/src/Tools/offlinedoc/buildwikiindex.py @@ -36,9 +36,9 @@ from urllib2 import urlopen, HTTPError # CONFIGURATION ################################################# -URL = "http://www.freecadweb.org/wiki" #default URL if no URL is passed +URL = "https://www.freecadweb.org/wiki" #default URL if no URL is passed INDEX = "Online_Help_Toc" # the start page from where to crawl the wiki -NORETRIEVE = ['Manual','Developer_hub','Power_users_hub','Users_hub','Source_documentation', 'User_hub','Main_Page','About_this_site','Interesting_links','Syndication_feeds','FreeCAD:General_disclaimer','FreeCAD:About','FreeCAD:Privacy_policy','Introduction_to_python'] # pages that won't be fetched (kept online) +NORETRIEVE = ['Manual','Developer_hub','Power_users_hub','Users_hub','Source_documentation', 'User_hub','Main_Page','About_this_site','Interesting_links','Syndication_feeds','FreeCAD:General_disclaimer','FreeCAD:About','FreeCAD:Privacy_policy','WikiPages'] # pages that won't be fetched (kept online) GETTRANSLATIONS = False # Set true if you want to get the translations too. MAXFAIL = 3 # max number of retries if download fails VERBOSE = True # to display what's going on. Otherwise, runs totally silent. @@ -48,35 +48,37 @@ WRITETHROUGH = True # if true, fetched files are constantly written to disk, in wikiindex = "/index.php?title=" -def crawl(pagename): +def crawl(pagename=[]): "downloads an entire wiki site" todolist = [] processed = [] count = 1 - if os.path.exists("wikifiles.txt"): - f = open("wikifiles.txt","r") - if VERBOSE: print "Reading existing list..." - for l in f.readlines(): - if l.strip() != "": - if VERBOSE: print "Adding ",l - processed.append(l.strip()) - f.close() - if os.path.exists("todolist.txt"): - f = open("todolist.txt","r") - if VERBOSE: print "Reading existing todo list..." 
- for l in f.readlines(): - if l.strip() != "": - todolist.append(l.strip()) - f.close() + if pagename: + if not isinstance(pagename,list): + pagename = [pagename] + todolist = pagename else: - if pagename: - todolist = pagename + if os.path.exists("wikifiles.txt"): + f = open("wikifiles.txt","r") + if VERBOSE: print "Reading existing list..." + for l in f.readlines(): + if l.strip() != "": + if VERBOSE: print "Adding ",l + processed.append(l.strip()) + f.close() + if os.path.exists("todolist.txt"): + f = open("todolist.txt","r") + if VERBOSE: print "Reading existing todo list..." + for l in f.readlines(): + if l.strip() != "": + todolist.append(l.strip()) + f.close() else: indexpages,imgs = get(INDEX) todolist.extend(indexpages) while todolist: targetpage = todolist.pop() - if not targetpage in NORETRIEVE: + if (not targetpage in NORETRIEVE): if VERBOSE: print count, ": Scanning ", targetpage pages,images = get(targetpage) count += 1 @@ -92,6 +94,8 @@ def crawl(pagename): if VERBOSE: print "Fetched ", count, " pages" if not WRITETHROUGH: writeList(processed) + if pagename: + return processed return 0 def get(page): @@ -126,11 +130,16 @@ def cleanhtml(html): def getlinks(html): "returns a list of wikipage links in html file" + global NORETRIEVE links = re.findall(']*>.*?',html) pages = [] for l in links: # rg = re.findall('php\?title=(.*)\" title',l) rg = re.findall('href=.*?php\?title=(.*?)"',l) + if not rg: + rg = re.findall('href="\/wiki\/(.*?)"',l) + if "images" in rg: + rg = None if rg: rg = rg[0] if not "Command_Reference" in rg: @@ -138,21 +147,23 @@ def getlinks(html): rg = rg.split('#')[0] if ":" in rg: NORETRIEVE.append(rg) - if ";" in rg: - NORETRIEVE.append(rg) if "&" in rg: NORETRIEVE.append(rg) + if ";" in rg: + NORETRIEVE.append(rg) if "/" in rg: if not GETTRANSLATIONS: NORETRIEVE.append(rg) - pages.append(rg) if not rg in NORETRIEVE: + pages.append(rg) print "got link: ",rg return pages def getimagelinks(html): "returns a list of image links found 
in an html file" - return re.findall(' * +#* * +#* This program is free software; you can redistribute it and/or modify * +#* it under the terms of the GNU Lesser General Public License (LGPL) * +#* as published by the Free Software Foundation; either version 2 of * +#* the License, or (at your option) any later version. * +#* for detail see the LICENCE text file. * +#* * +#* This program is distributed in the hope that it will be useful, * +#* but WITHOUT ANY WARRANTY; without even the implied warranty of * +#* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * +#* GNU Library General Public License for more details. * +#* * +#* You should have received a copy of the GNU Library General Public * +#* License along with this program; if not, write to the Free Software * +#* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 * +#* USA * +#* * +#*************************************************************************** + +__title__="update.py" +__author__ = "Yorik van Havre " +__url__ = "http://www.freecadweb.org" + +""" +This script needs to be run after the wiki has been fully downloaded. It has three usages: + +1) If no revisions.txt file is found, it parses the contents of the wikifiles.txt file + and, for each entry, it retrieves a corresponding revision ID, and creates a revisions.txt file + +2) If a revisions.txt file exists but no updates.txt file exists, it crawls through all entries of + wikifiles.txt, and for each one, compares the current revision with the one stored in revisions.txt. + An updates.txt file is created with all pages that have different revision IDs + +3) If updates.txt exists, each entry of it will be scanned again for new links and all the needed + files downloaded. revisions.txt and wikifiles.txt also get updated. 
+""" + +import sys, os, re, tempfile, getopt +from urllib2 import urlopen, HTTPError + +# CONFIGURATION ################################################# + +URL = "https://www.freecadweb.org/wiki" #default URL if no URL is passed +GETTRANSLATIONS = False # Set true if you want to get the translations too. +MAXFAIL = 3 # max number of retries if download fails +VERBOSE = True # to display what's going on. Otherwise, runs totally silent. + +# END CONFIGURATION ############################################## + +wikiindex = "/index.php?title=" + +def update(pagename=None): + + if not os.path.exists("revisions.txt"): # case 1) + if not os.path.exists("wikifiles.txt"): + print "No wikifiles.txt found. Aborting" + sys.exit() + pages = [] + f = open("wikifiles.txt","r") + if VERBOSE: print "Reading existing list..." + for l in f.readlines(): + if l.strip() != "": + if not "/wiki/" in l: + if VERBOSE: print "Adding ",l.strip() + pages.append(l.strip()) + f.close() + if VERBOSE: print "Added ",str(len(pages))," entries" + i = 1 + revs = [] + for page in pages: + rev = getRevision(page) + if VERBOSE: print str(i)," revision: ",rev + revs.append(page+":"+rev) + i += 1 + writeList(revs,"revisions.txt") + print "All done. Successfully written revisions.txt with ",len(revs)," entries." + + elif os.path.exists("revisions.txt") and (not os.path.exists("updates.txt")): # case 2) + f = open("revisions.txt","r") + if VERBOSE: print "Reading revisions list..." 
+ revisions = {} + for l in f.readlines(): + if l.strip() != "": + r = l.strip().split(":") + p = ":".join(r[:-1]) + if VERBOSE: print "Adding ",p + revisions[p] = r[1] + f.close() + if VERBOSE: print "Added ",str(len(revisions.keys()))," entries" + updates = [] + i = 1 + for page in revisions.keys(): + rev = getRevision(page) + if rev != revisions[page]: + if VERBOSE: print str(i),page," has a new revision: ",rev + updates.append(page) + else: + if VERBOSE: print str(i),page," is up to date " + i += 1 + if updates: + writeList(updates,"updates.txt") + print "All done. Successfully written updates.txt with ",len(updates)," entries." + else: + print "Everything up to date. Nothing to be done." + + elif os.path.exists("revisions.txt") and os.path.exists("updates.txt"): # case 3) + if not os.path.exists("wikifiles.txt"): + print "No wikifiles.txt found. Aborting" + sys.exit() + wikifiles = [] + f = open("wikifiles.txt","r") + if VERBOSE: print "Reading wikifiles list..." + for l in f.readlines(): + if l.strip() != "": + wikifiles.append(l.strip()) + f.close() + if VERBOSE: print "Read ",str(len(wikifiles))," entries" + f = open("revisions.txt","r") + if VERBOSE: print "Reading revisions list..." + revisions = {} + for l in f.readlines(): + if l.strip() != "": + r = l.strip().split(":") + p = ":".join(r[:-1]) + revisions[p] = r[1] + f.close() + todo = [] + f = open("updates.txt","r") + if VERBOSE: print "Reading updates list..." + for l in f.readlines(): + if l.strip() != "": + todo.append(l.strip()) + f.close() + if VERBOSE: print str(len(todo))," pages to scan..." + import buildwikiindex + buildwikiindex.WRITETHROUGH = False + buildwikiindex.VERBOSE = VERBOSE + updates = [] + for t in todo: + if VERBOSE: print "Scanning ",t + updates.extend(buildwikiindex.crawl(t)) + updates = [u for u in updates if not u in wikifiles] + if VERBOSE: print str(len(updates))," files to download..." 
+ import downloadwiki + i = 1 + for u in updates: + if VERBOSE: print i, ": Fetching ", u + downloadwiki.get(u) + if not "/wiki/" in u: + rev = getRevision(u) + revisions[u] = rev + if not u in wikifiles: + wikifiles.append(u) + i += 1 + if VERBOSE: print "Updating wikifiles and revisions..." + writeList(wikifiles,"wikifiles.txt") + updatedrevs = [] + for k in revisions.keys(): + updatedrevs.append(k+":"+revisions[k]) + writeList(updatedrevs,"revisions.txt") + os.remove("updates.txt") + if VERBOSE: print "All done!" + +def getRevision(page): + html = fetchPage(page) + revs = re.findall("wgCurRevisionId\"\:(.*?),",html) + if len(revs) == 1: + return revs[0] + print 'Error: unable to get revision ID of ' + page + sys.exit() + +def fetchPage(page): + "retrieves given page from the wiki" + print "fetching: ",page + failcount = 0 + while failcount < MAXFAIL: + try: + html = (urlopen(URL + wikiindex + page).read()) + return html + except HTTPError: + failcount += 1 + print 'Error: unable to fetch page ' + page + sys.exit() + +def writeList(pages,filename): + f = open(filename,"wb") + for p in pages: + f.write(p+"\n") + f.close() + if VERBOSE: print "written ",filename + +if __name__ == "__main__": + update(sys.argv[1:])