Updated wiki download scripts

This commit is contained in:
Yorik van Havre
2018-02-26 15:12:53 -03:00
parent 3dc0f77065
commit 8aad83f08f
5 changed files with 249 additions and 29 deletions

3
.gitignore vendored
View File

@@ -28,8 +28,7 @@ install_manifest.txt
/ZERO_CHECK.dir/ /ZERO_CHECK.dir/
/build/ /build/
/src/Tools/offlinedoc/localwiki/ /src/Tools/offlinedoc/localwiki/
/src/Tools/offlinedoc/todolist.txt /src/Tools/offlinedoc/*.txt
/src/Tools/offlinedoc/wikifiles.txt
OpenSCAD_rc.py OpenSCAD_rc.py
.subuser-dev .subuser-dev
/\.idea/ /\.idea/

View File

@@ -16,4 +16,13 @@ download and another to actually download the files.
4) run "buildpdf.py" to generate freecad.pdf (wkhtmltopdf must be installed) 4) run "buildpdf.py" to generate freecad.pdf (wkhtmltopdf must be installed)
5) the qhelp files can be tested with "assistant -collectionFile freecad.qhc" 5) the qhelp files can be tested with "assistant -collectionFile freecad.qhc"
6) If you have already downloaded the whole wiki, run "update.py" immediately
after, to create a list of revision IDs for each page.
7) Once the initial revisions list has been created, the "update.py" script
can be run anytime in the future, to check for pages that have changed
since the stored revision ID. The script is meant to run twice, once to get
a list of pages that have changed, and once more to download the changed
pages (and all their dependencies) again.

View File

@@ -36,9 +36,9 @@ from urllib2 import urlopen, HTTPError
# CONFIGURATION ################################################# # CONFIGURATION #################################################
URL = "http://www.freecadweb.org/wiki" #default URL if no URL is passed URL = "https://www.freecadweb.org/wiki" #default URL if no URL is passed
INDEX = "Online_Help_Toc" # the start page from where to crawl the wiki INDEX = "Online_Help_Toc" # the start page from where to crawl the wiki
NORETRIEVE = ['Manual','Developer_hub','Power_users_hub','Users_hub','Source_documentation', 'User_hub','Main_Page','About_this_site','Interesting_links','Syndication_feeds','FreeCAD:General_disclaimer','FreeCAD:About','FreeCAD:Privacy_policy','Introduction_to_python'] # pages that won't be fetched (kept online) NORETRIEVE = ['Manual','Developer_hub','Power_users_hub','Users_hub','Source_documentation', 'User_hub','Main_Page','About_this_site','Interesting_links','Syndication_feeds','FreeCAD:General_disclaimer','FreeCAD:About','FreeCAD:Privacy_policy','WikiPages'] # pages that won't be fetched (kept online)
GETTRANSLATIONS = False # Set true if you want to get the translations too. GETTRANSLATIONS = False # Set true if you want to get the translations too.
MAXFAIL = 3 # max number of retries if download fails MAXFAIL = 3 # max number of retries if download fails
VERBOSE = True # to display what's going on. Otherwise, runs totally silent. VERBOSE = True # to display what's going on. Otherwise, runs totally silent.
@@ -48,35 +48,37 @@ WRITETHROUGH = True # if true, fetched files are constantly written to disk, in
wikiindex = "/index.php?title=" wikiindex = "/index.php?title="
def crawl(pagename): def crawl(pagename=[]):
"downloads an entire wiki site" "downloads an entire wiki site"
todolist = [] todolist = []
processed = [] processed = []
count = 1 count = 1
if os.path.exists("wikifiles.txt"): if pagename:
f = open("wikifiles.txt","r") if not isinstance(pagename,list):
if VERBOSE: print "Reading existing list..." pagename = [pagename]
for l in f.readlines(): todolist = pagename
if l.strip() != "":
if VERBOSE: print "Adding ",l
processed.append(l.strip())
f.close()
if os.path.exists("todolist.txt"):
f = open("todolist.txt","r")
if VERBOSE: print "Reading existing todo list..."
for l in f.readlines():
if l.strip() != "":
todolist.append(l.strip())
f.close()
else: else:
if pagename: if os.path.exists("wikifiles.txt"):
todolist = pagename f = open("wikifiles.txt","r")
if VERBOSE: print "Reading existing list..."
for l in f.readlines():
if l.strip() != "":
if VERBOSE: print "Adding ",l
processed.append(l.strip())
f.close()
if os.path.exists("todolist.txt"):
f = open("todolist.txt","r")
if VERBOSE: print "Reading existing todo list..."
for l in f.readlines():
if l.strip() != "":
todolist.append(l.strip())
f.close()
else: else:
indexpages,imgs = get(INDEX) indexpages,imgs = get(INDEX)
todolist.extend(indexpages) todolist.extend(indexpages)
while todolist: while todolist:
targetpage = todolist.pop() targetpage = todolist.pop()
if not targetpage in NORETRIEVE: if (not targetpage in NORETRIEVE):
if VERBOSE: print count, ": Scanning ", targetpage if VERBOSE: print count, ": Scanning ", targetpage
pages,images = get(targetpage) pages,images = get(targetpage)
count += 1 count += 1
@@ -92,6 +94,8 @@ def crawl(pagename):
if VERBOSE: print "Fetched ", count, " pages" if VERBOSE: print "Fetched ", count, " pages"
if not WRITETHROUGH: if not WRITETHROUGH:
writeList(processed) writeList(processed)
if pagename:
return processed
return 0 return 0
def get(page): def get(page):
@@ -126,11 +130,16 @@ def cleanhtml(html):
def getlinks(html): def getlinks(html):
"returns a list of wikipage links in html file" "returns a list of wikipage links in html file"
global NORETRIEVE
links = re.findall('<a[^>]*>.*?</a>',html) links = re.findall('<a[^>]*>.*?</a>',html)
pages = [] pages = []
for l in links: for l in links:
# rg = re.findall('php\?title=(.*)\" title',l) # rg = re.findall('php\?title=(.*)\" title',l)
rg = re.findall('href=.*?php\?title=(.*?)"',l) rg = re.findall('href=.*?php\?title=(.*?)"',l)
if not rg:
rg = re.findall('href="\/wiki\/(.*?)"',l)
if "images" in rg:
rg = None
if rg: if rg:
rg = rg[0] rg = rg[0]
if not "Command_Reference" in rg: if not "Command_Reference" in rg:
@@ -138,21 +147,23 @@ def getlinks(html):
rg = rg.split('#')[0] rg = rg.split('#')[0]
if ":" in rg: if ":" in rg:
NORETRIEVE.append(rg) NORETRIEVE.append(rg)
if ";" in rg:
NORETRIEVE.append(rg)
if "&" in rg: if "&" in rg:
NORETRIEVE.append(rg) NORETRIEVE.append(rg)
if ";" in rg:
NORETRIEVE.append(rg)
if "/" in rg: if "/" in rg:
if not GETTRANSLATIONS: if not GETTRANSLATIONS:
NORETRIEVE.append(rg) NORETRIEVE.append(rg)
pages.append(rg)
if not rg in NORETRIEVE: if not rg in NORETRIEVE:
pages.append(rg)
print "got link: ",rg print "got link: ",rg
return pages return pages
def getimagelinks(html): def getimagelinks(html):
"returns a list of image links found in an html file" "returns a list of image links found in an html file"
return re.findall('<img.*?src="(.*?)"',html) imlinks = re.findall('<img.*?src="(.*?)"',html)
imlinks = [l for l in imlinks if not l.startswith("http")] # remove external images
return imlinks
def fetchpage(page): def fetchpage(page):
"retrieves given page from the wiki" "retrieves given page from the wiki"
@@ -165,6 +176,7 @@ def fetchpage(page):
except HTTPError: except HTTPError:
failcount += 1 failcount += 1
print 'Error: unable to fetch page ' + page print 'Error: unable to fetch page ' + page
sys.exit()
def cleanList(pagelist): def cleanList(pagelist):
"cleans the list" "cleans the list"

View File

@@ -35,7 +35,7 @@ from urllib2 import urlopen, HTTPError
# CONFIGURATION ################################################# # CONFIGURATION #################################################
DEFAULTURL = "http://www.freecadweb.org/wiki" #default URL if no URL is passed DEFAULTURL = "https://www.freecadweb.org/wiki" #default URL if no URL is passed
INDEX = "Online_Help_Toc" # the start page from where to crawl the wiki INDEX = "Online_Help_Toc" # the start page from where to crawl the wiki
NORETRIEVE = ['Manual','Developer_hub','Power_users_hub','Users_hub','Source_documentation', 'User_hub','Main_Page','About_this_site','FreeCAD:General_disclaimer','FreeCAD:About','FreeCAD:Privacy_policy','Introduction_to_python'] # pages that won't be fetched (kept online) NORETRIEVE = ['Manual','Developer_hub','Power_users_hub','Users_hub','Source_documentation', 'User_hub','Main_Page','About_this_site','FreeCAD:General_disclaimer','FreeCAD:About','FreeCAD:Privacy_policy','Introduction_to_python'] # pages that won't be fetched (kept online)
GETTRANSLATIONS = False # Set true if you want to get the translations too. GETTRANSLATIONS = False # Set true if you want to get the translations too.
@@ -189,6 +189,8 @@ def getlinks(html):
for l in links: for l in links:
# rg = re.findall('php\?title=(.*)\" title',l) # rg = re.findall('php\?title=(.*)\" title',l)
rg = re.findall('href=.*?php\?title=(.*?)"',l) rg = re.findall('href=.*?php\?title=(.*?)"',l)
if not rg:
rg = re.findall('href="\/wiki\/(.*?)"',l)
if rg: if rg:
rg = rg[0] rg = rg[0]
if not "Command_Reference" in rg: if not "Command_Reference" in rg:

198
src/Tools/offlinedoc/update.py Executable file
View File

@@ -0,0 +1,198 @@
#!/usr/bin/env python
#***************************************************************************
#* *
#* Copyright (c) 2017 Yorik van Havre <yorik@uncreated.net> *
#* *
#* This program is free software; you can redistribute it and/or modify *
#* it under the terms of the GNU Lesser General Public License (LGPL) *
#* as published by the Free Software Foundation; either version 2 of *
#* the License, or (at your option) any later version. *
#* for detail see the LICENCE text file. *
#* *
#* This program is distributed in the hope that it will be useful, *
#* but WITHOUT ANY WARRANTY; without even the implied warranty of *
#* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
#* GNU Library General Public License for more details. *
#* *
#* You should have received a copy of the GNU Library General Public *
#* License along with this program; if not, write to the Free Software *
#* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 *
#* USA *
#* *
#***************************************************************************
__title__="update.py"
__author__ = "Yorik van Havre <yorik@uncreated.net>"
__url__ = "http://www.freecadweb.org"
"""
This script needs to be run after the wiki has been fully downloaded. It has three usages:
1) If no revisions.txt file is found, it parses the contents of the wikifiles.txt file
and, for each entry, it retrieves a corresponding revision ID, and creates a revisions.txt file
2) If a revisions.txt file exists but no updates.txt file exists, it crawls through all entries of
wikifiles.txt, and for each one, compares the current revision with the one stored in revisions.txt.
An updates.txt file is created with all pages that have different revision IDs
3) If updates.txt exists, each entry of it will be scanned again for new links and all the needed
files downloaded. revisions.txt and wikifiles.txt also get updated.
"""
import sys, os, re, tempfile, getopt
from urllib2 import urlopen, HTTPError
# CONFIGURATION #################################################
URL = "https://www.freecadweb.org/wiki" #default URL if no URL is passed
GETTRANSLATIONS = False # Set true if you want to get the translations too.
MAXFAIL = 3 # max number of retries if download fails
VERBOSE = True # to display what's going on. Otherwise, runs totally silent.
# END CONFIGURATION ##############################################
wikiindex = "/index.php?title="
def update(pagename=None):
if not os.path.exists("revisions.txt"): # case 1)
if not os.path.exists("wikifiles.txt"):
print "No wikifiles.txt found. Aborting"
sys.exit()
pages = []
f = open("wikifiles.txt","r")
if VERBOSE: print "Reading existing list..."
for l in f.readlines():
if l.strip() != "":
if not "/wiki/" in l:
if VERBOSE: print "Adding ",l.strip()
pages.append(l.strip())
f.close()
if VERBOSE: print "Added ",str(len(pages))," entries"
i = 1
revs = []
for page in pages:
rev = getRevision(page)
if VERBOSE: print str(i)," revision: ",rev
revs.append(page+":"+rev)
i += 1
writeList(revs,"revisions.txt")
print "All done. Successfully written revisions.txt with ",len(revs)," entries."
elif os.path.exists("revisions.txt") and (not os.path.exists("updates.txt")): # case 2)
f = open("revisions.txt","r")
if VERBOSE: print "Reading revisions list..."
revisions = {}
for l in f.readlines():
if l.strip() != "":
r = l.strip().split(":")
p = ":".join(r[:-1])
if VERBOSE: print "Adding ",p
revisions[p] = r[1]
f.close()
if VERBOSE: print "Added ",str(len(revisions.keys()))," entries"
updates = []
i = 1
for page in revisions.keys():
rev = getRevision(page)
if rev != revisions[page]:
if VERBOSE: print str(i),page," has a new revision: ",rev
updates.append(page)
else:
if VERBOSE: print str(i),page," is up to date "
i += 1
if updates:
writeList(updates,"updates.txt")
print "All done. Successfully written updates.txt with ",len(updates)," entries."
else:
print "Everything up to date. Nothing to be done."
elif os.path.exists("revisions.txt") and os.path.exists("updates.txt"): # case 3)
if not os.path.exists("wikifiles.txt"):
print "No wikifiles.txt found. Aborting"
sys.exit()
wikifiles = []
f = open("wikifiles.txt","r")
if VERBOSE: print "Reading wikifiles list..."
for l in f.readlines():
if l.strip() != "":
wikifiles.append(l.strip())
f.close()
if VERBOSE: print "Read ",str(len(wikifiles))," entries"
f = open("revisions.txt","r")
if VERBOSE: print "Reading revisions list..."
revisions = {}
for l in f.readlines():
if l.strip() != "":
r = l.strip().split(":")
p = ":".join(r[:-1])
revisions[p] = r[1]
f.close()
todo = []
f = open("updates.txt","r")
if VERBOSE: print "Reading updates list..."
for l in f.readlines():
if l.strip() != "":
todo.append(l.strip())
f.close()
if VERBOSE: print str(len(todo))," pages to scan..."
import buildwikiindex
buildwikiindex.WRITETHROUGH = False
buildwikiindex.VERBOSE = VERBOSE
updates = []
for t in todo:
if VERBOSE: print "Scanning ",t
updates.extend(buildwikiindex.crawl(t))
updates = [u for u in updates if not u in wikifiles]
if VERBOSE: print str(len(updates))," files to download..."
import downloadwiki
i = 1
for u in updates:
if VERBOSE: print i, ": Fetching ", u
downloadwiki.get(u)
if not "/wiki/" in u:
rev = getRevision(u)
revisions[u] = rev
if not u in wikifiles:
wikifiles.append(u)
i += 1
if VERBOSE: print "Updating wikifiles and revisions..."
writeList(wikifiles,"wikifiles.txt")
updatedrevs = []
for k in revisions.keys():
updatedrevs.append(k+":"+revisions[k])
writeList(updatedrevs,"revisions.txt")
os.remove("updates.txt")
if VERBOSE: print "All done!"
def getRevision(page):
html = fetchPage(page)
revs = re.findall("wgCurRevisionId\"\:(.*?),",html)
if len(revs) == 1:
return revs[0]
print 'Error: unable to get revision ID of ' + page
sys.exit()
def fetchPage(page):
"retrieves given page from the wiki"
print "fetching: ",page
failcount = 0
while failcount < MAXFAIL:
try:
html = (urlopen(URL + wikiindex + page).read())
return html
except HTTPError:
failcount += 1
print 'Error: unable to fetch page ' + page
sys.exit()
def writeList(pages,filename):
f = open(filename,"wb")
for p in pages:
f.write(p+"\n")
f.close()
if VERBOSE: print "written ",filename
if __name__ == "__main__":
    # command-line arguments are passed through as a list; update()
    # currently ignores them and dispatches on which state files exist
    update(sys.argv[1:])