Updated wiki download scripts

This commit is contained in:
Yorik van Havre
2018-02-26 15:12:53 -03:00
parent 3dc0f77065
commit 8aad83f08f
5 changed files with 249 additions and 29 deletions

3
.gitignore vendored
View File

@@ -28,8 +28,7 @@ install_manifest.txt
/ZERO_CHECK.dir/ /ZERO_CHECK.dir/
/build/ /build/
/src/Tools/offlinedoc/localwiki/ /src/Tools/offlinedoc/localwiki/
/src/Tools/offlinedoc/todolist.txt /src/Tools/offlinedoc/*.txt
/src/Tools/offlinedoc/wikifiles.txt
OpenSCAD_rc.py OpenSCAD_rc.py
.subuser-dev .subuser-dev
/\.idea/ /\.idea/

View File

@@ -16,4 +16,13 @@ download and another to actually download the files.
4) run "buildpdf.py" to generate freecad.pdf (wkhtmltopdf must be installed) 4) run "buildpdf.py" to generate freecad.pdf (wkhtmltopdf must be installed)
5) the qhelp files can be tested with "assistant -collectionFile freecad.qhc" 5) the qhelp files can be tested with "assistant -collectionFile freecad.qhc"
6) If you have already downloaded the whole wiki, run "update.py" immediately
after, to create a list of revision IDs for each page.
7) Once the initial revisions list has been created, the "update.py" script
can be run anytime in the future, to check for pages that have changed
since the stored revision ID. The script is meant to run twice, once to get
a list of pages that have changed, and once more to download the changed
pages (and all their dependencies) again.

View File

@@ -36,9 +36,9 @@ from urllib2 import urlopen, HTTPError
# CONFIGURATION ################################################# # CONFIGURATION #################################################
URL = "http://www.freecadweb.org/wiki" #default URL if no URL is passed URL = "https://www.freecadweb.org/wiki" #default URL if no URL is passed
INDEX = "Online_Help_Toc" # the start page from where to crawl the wiki INDEX = "Online_Help_Toc" # the start page from where to crawl the wiki
NORETRIEVE = ['Manual','Developer_hub','Power_users_hub','Users_hub','Source_documentation', 'User_hub','Main_Page','About_this_site','Interesting_links','Syndication_feeds','FreeCAD:General_disclaimer','FreeCAD:About','FreeCAD:Privacy_policy','Introduction_to_python'] # pages that won't be fetched (kept online) NORETRIEVE = ['Manual','Developer_hub','Power_users_hub','Users_hub','Source_documentation', 'User_hub','Main_Page','About_this_site','Interesting_links','Syndication_feeds','FreeCAD:General_disclaimer','FreeCAD:About','FreeCAD:Privacy_policy','WikiPages'] # pages that won't be fetched (kept online)
GETTRANSLATIONS = False # Set true if you want to get the translations too. GETTRANSLATIONS = False # Set true if you want to get the translations too.
MAXFAIL = 3 # max number of retries if download fails MAXFAIL = 3 # max number of retries if download fails
VERBOSE = True # to display what's going on. Otherwise, runs totally silent. VERBOSE = True # to display what's going on. Otherwise, runs totally silent.
@@ -48,35 +48,37 @@ WRITETHROUGH = True # if true, fetched files are constantly written to disk, in
wikiindex = "/index.php?title=" wikiindex = "/index.php?title="
def crawl(pagename): def crawl(pagename=[]):
"downloads an entire wiki site" "downloads an entire wiki site"
todolist = [] todolist = []
processed = [] processed = []
count = 1 count = 1
if os.path.exists("wikifiles.txt"): if pagename:
f = open("wikifiles.txt","r") if not isinstance(pagename,list):
if VERBOSE: print "Reading existing list..." pagename = [pagename]
for l in f.readlines(): todolist = pagename
if l.strip() != "":
if VERBOSE: print "Adding ",l
processed.append(l.strip())
f.close()
if os.path.exists("todolist.txt"):
f = open("todolist.txt","r")
if VERBOSE: print "Reading existing todo list..."
for l in f.readlines():
if l.strip() != "":
todolist.append(l.strip())
f.close()
else: else:
if pagename: if os.path.exists("wikifiles.txt"):
todolist = pagename f = open("wikifiles.txt","r")
if VERBOSE: print "Reading existing list..."
for l in f.readlines():
if l.strip() != "":
if VERBOSE: print "Adding ",l
processed.append(l.strip())
f.close()
if os.path.exists("todolist.txt"):
f = open("todolist.txt","r")
if VERBOSE: print "Reading existing todo list..."
for l in f.readlines():
if l.strip() != "":
todolist.append(l.strip())
f.close()
else: else:
indexpages,imgs = get(INDEX) indexpages,imgs = get(INDEX)
todolist.extend(indexpages) todolist.extend(indexpages)
while todolist: while todolist:
targetpage = todolist.pop() targetpage = todolist.pop()
if not targetpage in NORETRIEVE: if (not targetpage in NORETRIEVE):
if VERBOSE: print count, ": Scanning ", targetpage if VERBOSE: print count, ": Scanning ", targetpage
pages,images = get(targetpage) pages,images = get(targetpage)
count += 1 count += 1
@@ -92,6 +94,8 @@ def crawl(pagename):
if VERBOSE: print "Fetched ", count, " pages" if VERBOSE: print "Fetched ", count, " pages"
if not WRITETHROUGH: if not WRITETHROUGH:
writeList(processed) writeList(processed)
if pagename:
return processed
return 0 return 0
def get(page): def get(page):
@@ -126,11 +130,16 @@ def cleanhtml(html):
def getlinks(html): def getlinks(html):
"returns a list of wikipage links in html file" "returns a list of wikipage links in html file"
global NORETRIEVE
links = re.findall('<a[^>]*>.*?</a>',html) links = re.findall('<a[^>]*>.*?</a>',html)
pages = [] pages = []
for l in links: for l in links:
# rg = re.findall('php\?title=(.*)\" title',l) # rg = re.findall('php\?title=(.*)\" title',l)
rg = re.findall('href=.*?php\?title=(.*?)"',l) rg = re.findall('href=.*?php\?title=(.*?)"',l)
if not rg:
rg = re.findall('href="\/wiki\/(.*?)"',l)
if "images" in rg:
rg = None
if rg: if rg:
rg = rg[0] rg = rg[0]
if not "Command_Reference" in rg: if not "Command_Reference" in rg:
@@ -138,21 +147,23 @@ def getlinks(html):
rg = rg.split('#')[0] rg = rg.split('#')[0]
if ":" in rg: if ":" in rg:
NORETRIEVE.append(rg) NORETRIEVE.append(rg)
if ";" in rg:
NORETRIEVE.append(rg)
if "&" in rg: if "&" in rg:
NORETRIEVE.append(rg) NORETRIEVE.append(rg)
if ";" in rg:
NORETRIEVE.append(rg)
if "/" in rg: if "/" in rg:
if not GETTRANSLATIONS: if not GETTRANSLATIONS:
NORETRIEVE.append(rg) NORETRIEVE.append(rg)
pages.append(rg)
if not rg in NORETRIEVE: if not rg in NORETRIEVE:
pages.append(rg)
print "got link: ",rg print "got link: ",rg
return pages return pages
def getimagelinks(html): def getimagelinks(html):
"returns a list of image links found in an html file" "returns a list of image links found in an html file"
return re.findall('<img.*?src="(.*?)"',html) imlinks = re.findall('<img.*?src="(.*?)"',html)
imlinks = [l for l in imlinks if not l.startswith("http")] # remove external images
return imlinks
def fetchpage(page): def fetchpage(page):
"retrieves given page from the wiki" "retrieves given page from the wiki"
@@ -165,6 +176,7 @@ def fetchpage(page):
except HTTPError: except HTTPError:
failcount += 1 failcount += 1
print 'Error: unable to fetch page ' + page print 'Error: unable to fetch page ' + page
sys.exit()
def cleanList(pagelist): def cleanList(pagelist):
"cleans the list" "cleans the list"

View File

@@ -35,7 +35,7 @@ from urllib2 import urlopen, HTTPError
# CONFIGURATION ################################################# # CONFIGURATION #################################################
DEFAULTURL = "http://www.freecadweb.org/wiki" #default URL if no URL is passed DEFAULTURL = "https://www.freecadweb.org/wiki" #default URL if no URL is passed
INDEX = "Online_Help_Toc" # the start page from where to crawl the wiki INDEX = "Online_Help_Toc" # the start page from where to crawl the wiki
NORETRIEVE = ['Manual','Developer_hub','Power_users_hub','Users_hub','Source_documentation', 'User_hub','Main_Page','About_this_site','FreeCAD:General_disclaimer','FreeCAD:About','FreeCAD:Privacy_policy','Introduction_to_python'] # pages that won't be fetched (kept online) NORETRIEVE = ['Manual','Developer_hub','Power_users_hub','Users_hub','Source_documentation', 'User_hub','Main_Page','About_this_site','FreeCAD:General_disclaimer','FreeCAD:About','FreeCAD:Privacy_policy','Introduction_to_python'] # pages that won't be fetched (kept online)
GETTRANSLATIONS = False # Set true if you want to get the translations too. GETTRANSLATIONS = False # Set true if you want to get the translations too.
@@ -189,6 +189,8 @@ def getlinks(html):
for l in links: for l in links:
# rg = re.findall('php\?title=(.*)\" title',l) # rg = re.findall('php\?title=(.*)\" title',l)
rg = re.findall('href=.*?php\?title=(.*?)"',l) rg = re.findall('href=.*?php\?title=(.*?)"',l)
if not rg:
rg = re.findall('href="\/wiki\/(.*?)"',l)
if rg: if rg:
rg = rg[0] rg = rg[0]
if not "Command_Reference" in rg: if not "Command_Reference" in rg:

198
src/Tools/offlinedoc/update.py Executable file
View File

@@ -0,0 +1,198 @@
#!/usr/bin/env python
#***************************************************************************
#* *
#* Copyright (c) 2017 Yorik van Havre <yorik@uncreated.net> *
#* *
#* This program is free software; you can redistribute it and/or modify *
#* it under the terms of the GNU Lesser General Public License (LGPL) *
#* as published by the Free Software Foundation; either version 2 of *
#* the License, or (at your option) any later version. *
#* for detail see the LICENCE text file. *
#* *
#* This program is distributed in the hope that it will be useful, *
#* but WITHOUT ANY WARRANTY; without even the implied warranty of *
#* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
#* GNU Library General Public License for more details. *
#* *
#* You should have received a copy of the GNU Library General Public *
#* License along with this program; if not, write to the Free Software *
#* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 *
#* USA *
#* *
#***************************************************************************
__title__="update.py"
__author__ = "Yorik van Havre <yorik@uncreated.net>"
__url__ = "http://www.freecadweb.org"
"""
This script needs to be run after the wiki has been fully downloaded. It has three usages:
1) If no revisions.txt file is found, it parses the contents of the wikifiles.txt file
and, for each entry, it retrieves a corresponding revision ID, and creates a revisions.txt file
2) If a revisions.txt file exists but no updates.txt file exists, it crawls through all entries of
wikifiles.txt, and for each one, compares the current revision with the one stored in revisions.txt.
An updates.txt file is created with all pages that have different revision IDs
3) If updates.txt exists, each entry of it will be scanned again for new links and all the needed
files downloaded. revisions.txt and wikifiles.txt also get updated.
"""
import sys, os, re, tempfile, getopt
from urllib2 import urlopen, HTTPError
# CONFIGURATION #################################################
URL = "https://www.freecadweb.org/wiki" #default URL if no URL is passed
GETTRANSLATIONS = False # Set true if you want to get the translations too.
MAXFAIL = 3 # max number of retries if download fails
VERBOSE = True # to display what's going on. Otherwise, runs totally silent.
# END CONFIGURATION ##############################################
wikiindex = "/index.php?title="
def update(pagename=None):
if not os.path.exists("revisions.txt"): # case 1)
if not os.path.exists("wikifiles.txt"):
print "No wikifiles.txt found. Aborting"
sys.exit()
pages = []
f = open("wikifiles.txt","r")
if VERBOSE: print "Reading existing list..."
for l in f.readlines():
if l.strip() != "":
if not "/wiki/" in l:
if VERBOSE: print "Adding ",l.strip()
pages.append(l.strip())
f.close()
if VERBOSE: print "Added ",str(len(pages))," entries"
i = 1
revs = []
for page in pages:
rev = getRevision(page)
if VERBOSE: print str(i)," revision: ",rev
revs.append(page+":"+rev)
i += 1
writeList(revs,"revisions.txt")
print "All done. Successfully written revisions.txt with ",len(revs)," entries."
elif os.path.exists("revisions.txt") and (not os.path.exists("updates.txt")): # case 2)
f = open("revisions.txt","r")
if VERBOSE: print "Reading revisions list..."
revisions = {}
for l in f.readlines():
if l.strip() != "":
r = l.strip().split(":")
p = ":".join(r[:-1])
if VERBOSE: print "Adding ",p
revisions[p] = r[1]
f.close()
if VERBOSE: print "Added ",str(len(revisions.keys()))," entries"
updates = []
i = 1
for page in revisions.keys():
rev = getRevision(page)
if rev != revisions[page]:
if VERBOSE: print str(i),page," has a new revision: ",rev
updates.append(page)
else:
if VERBOSE: print str(i),page," is up to date "
i += 1
if updates:
writeList(updates,"updates.txt")
print "All done. Successfully written updates.txt with ",len(updates)," entries."
else:
print "Everything up to date. Nothing to be done."
elif os.path.exists("revisions.txt") and os.path.exists("updates.txt"): # case 3)
if not os.path.exists("wikifiles.txt"):
print "No wikifiles.txt found. Aborting"
sys.exit()
wikifiles = []
f = open("wikifiles.txt","r")
if VERBOSE: print "Reading wikifiles list..."
for l in f.readlines():
if l.strip() != "":
wikifiles.append(l.strip())
f.close()
if VERBOSE: print "Read ",str(len(wikifiles))," entries"
f = open("revisions.txt","r")
if VERBOSE: print "Reading revisions list..."
revisions = {}
for l in f.readlines():
if l.strip() != "":
r = l.strip().split(":")
p = ":".join(r[:-1])
revisions[p] = r[1]
f.close()
todo = []
f = open("updates.txt","r")
if VERBOSE: print "Reading updates list..."
for l in f.readlines():
if l.strip() != "":
todo.append(l.strip())
f.close()
if VERBOSE: print str(len(todo))," pages to scan..."
import buildwikiindex
buildwikiindex.WRITETHROUGH = False
buildwikiindex.VERBOSE = VERBOSE
updates = []
for t in todo:
if VERBOSE: print "Scanning ",t
updates.extend(buildwikiindex.crawl(t))
updates = [u for u in updates if not u in wikifiles]
if VERBOSE: print str(len(updates))," files to download..."
import downloadwiki
i = 1
for u in updates:
if VERBOSE: print i, ": Fetching ", u
downloadwiki.get(u)
if not "/wiki/" in u:
rev = getRevision(u)
revisions[u] = rev
if not u in wikifiles:
wikifiles.append(u)
i += 1
if VERBOSE: print "Updating wikifiles and revisions..."
writeList(wikifiles,"wikifiles.txt")
updatedrevs = []
for k in revisions.keys():
updatedrevs.append(k+":"+revisions[k])
writeList(updatedrevs,"revisions.txt")
os.remove("updates.txt")
if VERBOSE: print "All done!"
def getRevision(page):
html = fetchPage(page)
revs = re.findall("wgCurRevisionId\"\:(.*?),",html)
if len(revs) == 1:
return revs[0]
print 'Error: unable to get revision ID of ' + page
sys.exit()
def fetchPage(page):
"retrieves given page from the wiki"
print "fetching: ",page
failcount = 0
while failcount < MAXFAIL:
try:
html = (urlopen(URL + wikiindex + page).read())
return html
except HTTPError:
failcount += 1
print 'Error: unable to fetch page ' + page
sys.exit()
def writeList(pages,filename):
f = open(filename,"wb")
for p in pages:
f.write(p+"\n")
f.close()
if VERBOSE: print "written ",filename
if __name__ == "__main__":
    # command-line arguments are passed through as a list; update()
    # currently ignores them and dispatches on which state files exist
    update(sys.argv[1:])