Updated wiki download scripts
.gitignore
@@ -28,8 +28,7 @@ install_manifest.txt
 /ZERO_CHECK.dir/
 /build/
 /src/Tools/offlinedoc/localwiki/
-/src/Tools/offlinedoc/todolist.txt
-/src/Tools/offlinedoc/wikifiles.txt
+/src/Tools/offlinedoc/*.txt
 OpenSCAD_rc.py
 .subuser-dev
 /\.idea/
@@ -16,4 +16,13 @@ download and another to actually download the files.
 
 4) run "buildpdf.py" to generate freecad.pdf (wkhtmltopdf must be installed)
 
 5) the qhelp files can be tested with "assistant -collectionFile freecad.qhc"
+
+6) If you have already downloaded the whole wiki, run "update.py" immediately
+after, to create a list of revision IDs for each page.
+
+7) Once the initial revisions list has been created, the "update.py" script
+can be run anytime in the future, to check for pages that have changed
+since the stored revision ID. The script is meant to run twice, once to get
+a list of pages that have changed, and another one to download the changed
+pages (and all their dependencies) again.
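The revision check behind steps 6) and 7) can be sketched as follows; this is a minimal Python 2 illustration in the same urllib2/print style as the scripts themselves. The wgCurRevisionId regex is the one update.py actually uses; the URL value, the live_revision helper and the example page are placeholders for illustration only:

    # Minimal sketch of the two-pass update flow (Python 2).
    import re
    from urllib2 import urlopen

    URL = "https://www.freecadweb.org/wiki/index.php?title="

    def live_revision(page):
        "scrape the current revision ID embedded in the page's JS config"
        html = urlopen(URL + page).read()
        return re.findall("wgCurRevisionId\"\:(.*?),", html)[0]

    # first run, right after the full download: store one ID per page
    stored = {"Online_Help_Toc": live_revision("Online_Help_Toc")}

    # any later run: pages whose live ID differs must be fetched again
    changed = [p for p, rev in stored.items() if live_revision(p) != rev]
    print "pages to re-download: ", changed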
src/Tools/offlinedoc/buildwikiindex.py
@@ -36,9 +36,9 @@ from urllib2 import urlopen, HTTPError
 
 # CONFIGURATION #################################################
 
-URL = "http://www.freecadweb.org/wiki" #default URL if no URL is passed
+URL = "https://www.freecadweb.org/wiki" #default URL if no URL is passed
 INDEX = "Online_Help_Toc" # the start page from where to crawl the wiki
-NORETRIEVE = ['Manual','Developer_hub','Power_users_hub','Users_hub','Source_documentation', 'User_hub','Main_Page','About_this_site','Interesting_links','Syndication_feeds','FreeCAD:General_disclaimer','FreeCAD:About','FreeCAD:Privacy_policy','Introduction_to_python'] # pages that won't be fetched (kept online)
+NORETRIEVE = ['Manual','Developer_hub','Power_users_hub','Users_hub','Source_documentation', 'User_hub','Main_Page','About_this_site','Interesting_links','Syndication_feeds','FreeCAD:General_disclaimer','FreeCAD:About','FreeCAD:Privacy_policy','WikiPages'] # pages that won't be fetched (kept online)
 GETTRANSLATIONS = False # Set true if you want to get the translations too.
 MAXFAIL = 3 # max number of retries if download fails
 VERBOSE = True # to display what's going on. Otherwise, runs totally silent.
@@ -48,35 +48,37 @@ WRITETHROUGH = True # if true, fetched files are constantly written to disk, in
 
 wikiindex = "/index.php?title="
 
-def crawl(pagename):
+def crawl(pagename=[]):
     "downloads an entire wiki site"
     todolist = []
     processed = []
     count = 1
-    if pagename:
-        todolist = pagename
     if os.path.exists("wikifiles.txt"):
         f = open("wikifiles.txt","r")
         if VERBOSE: print "Reading existing list..."
         for l in f.readlines():
             if l.strip() != "":
                 if VERBOSE: print "Adding ",l
                 processed.append(l.strip())
         f.close()
     if os.path.exists("todolist.txt"):
         f = open("todolist.txt","r")
         if VERBOSE: print "Reading existing todo list..."
         for l in f.readlines():
             if l.strip() != "":
                 todolist.append(l.strip())
         f.close()
+    if pagename:
+        if not isinstance(pagename,list):
+            pagename = [pagename]
+        todolist = pagename
     else:
         indexpages,imgs = get(INDEX)
         todolist.extend(indexpages)
     while todolist:
         targetpage = todolist.pop()
-        if not targetpage in NORETRIEVE:
+        if (not targetpage in NORETRIEVE):
             if VERBOSE: print count, ": Scanning ", targetpage
             pages,images = get(targetpage)
             count += 1
@@ -92,6 +94,8 @@ def crawl(pagename):
     if VERBOSE: print "Fetched ", count, " pages"
     if not WRITETHROUGH:
         writeList(processed)
+    if pagename:
+        return processed
     return 0
 
 def get(page):
@@ -126,11 +130,16 @@ def cleanhtml(html):
 
 def getlinks(html):
     "returns a list of wikipage links in html file"
+    global NORETRIEVE
     links = re.findall('<a[^>]*>.*?</a>',html)
     pages = []
     for l in links:
         # rg = re.findall('php\?title=(.*)\" title',l)
         rg = re.findall('href=.*?php\?title=(.*?)"',l)
+        if not rg:
+            rg = re.findall('href="\/wiki\/(.*?)"',l)
+            if "images" in rg:
+                rg = None
         if rg:
             rg = rg[0]
             if not "Command_Reference" in rg:
@@ -138,21 +147,23 @@ def getlinks(html):
                 rg = rg.split('#')[0]
                 if ":" in rg:
                     NORETRIEVE.append(rg)
-                if ";" in rg:
-                    NORETRIEVE.append(rg)
+                if "&" in rg:
+                    NORETRIEVE.append(rg)
+                if ";" in rg:
+                    NORETRIEVE.append(rg)
                 if "/" in rg:
                     if not GETTRANSLATIONS:
                         NORETRIEVE.append(rg)
-                pages.append(rg)
+                if not rg in NORETRIEVE:
+                    pages.append(rg)
+                    print "got link: ",rg
     return pages
 
 def getimagelinks(html):
     "returns a list of image links found in an html file"
-    return re.findall('<img.*?src="(.*?)"',html)
+    imlinks = re.findall('<img.*?src="(.*?)"',html)
+    imlinks = [l for l in imlinks if not l.startswith("http")] # remove external images
+    return imlinks
 
 def fetchpage(page):
     "retrieves given page from the wiki"
@@ -165,6 +176,7 @@ def fetchpage(page):
         except HTTPError:
             failcount += 1
     print 'Error: unable to fetch page ' + page
+    sys.exit()
 
 def cleanList(pagelist):
     "cleans the list"
src/Tools/offlinedoc/downloadwiki.py
@@ -35,7 +35,7 @@ from urllib2 import urlopen, HTTPError
 
 # CONFIGURATION #################################################
 
-DEFAULTURL = "http://www.freecadweb.org/wiki" #default URL if no URL is passed
+DEFAULTURL = "https://www.freecadweb.org/wiki" #default URL if no URL is passed
 INDEX = "Online_Help_Toc" # the start page from where to crawl the wiki
 NORETRIEVE = ['Manual','Developer_hub','Power_users_hub','Users_hub','Source_documentation', 'User_hub','Main_Page','About_this_site','FreeCAD:General_disclaimer','FreeCAD:About','FreeCAD:Privacy_policy','Introduction_to_python'] # pages that won't be fetched (kept online)
 GETTRANSLATIONS = False # Set true if you want to get the translations too.
@@ -189,6 +189,8 @@ def getlinks(html):
     for l in links:
         # rg = re.findall('php\?title=(.*)\" title',l)
         rg = re.findall('href=.*?php\?title=(.*?)"',l)
+        if not rg:
+            rg = re.findall('href="\/wiki\/(.*?)"',l)
         if rg:
             rg = rg[0]
             if not "Command_Reference" in rg:
src/Tools/offlinedoc/update.py (new executable file)
@@ -0,0 +1,198 @@
#!/usr/bin/env python

#***************************************************************************
#*                                                                         *
#*   Copyright (c) 2017 Yorik van Havre <yorik@uncreated.net>              *
#*                                                                         *
#*   This program is free software; you can redistribute it and/or modify *
#*   it under the terms of the GNU Lesser General Public License (LGPL)   *
#*   as published by the Free Software Foundation; either version 2 of    *
#*   the License, or (at your option) any later version.                  *
#*   for detail see the LICENCE text file.                                *
#*                                                                         *
#*   This program is distributed in the hope that it will be useful,      *
#*   but WITHOUT ANY WARRANTY; without even the implied warranty of       *
#*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the        *
#*   GNU Library General Public License for more details.                 *
#*                                                                         *
#*   You should have received a copy of the GNU Library General Public    *
#*   License along with this program; if not, write to the Free Software  *
#*   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307  *
#*   USA                                                                   *
#*                                                                         *
#***************************************************************************

__title__="update.py"
__author__ = "Yorik van Havre <yorik@uncreated.net>"
__url__ = "http://www.freecadweb.org"

"""
This script needs to be run after the wiki has been fully downloaded. It has three usages:

1) If no revisions.txt file is found, it parses the contents of the wikifiles.txt file
and, for each entry, retrieves a corresponding revision ID, and creates a revisions.txt file

2) If a revisions.txt file exists but no updates.txt file exists, it crawls through all entries of
wikifiles.txt, and for each one, compares the current revision with the one stored in revisions.txt.
An updates.txt file is created with all pages that have different revision IDs

3) If updates.txt exists, each entry of it will be scanned again for new links and all the needed
files downloaded. revisions.txt and wikifiles.txt also get updated.
"""
import sys, os, re, tempfile, getopt
from urllib2 import urlopen, HTTPError

# CONFIGURATION #################################################

URL = "https://www.freecadweb.org/wiki" #default URL if no URL is passed
GETTRANSLATIONS = False # Set true if you want to get the translations too.
MAXFAIL = 3 # max number of retries if download fails
VERBOSE = True # to display what's going on. Otherwise, runs totally silent.

# END CONFIGURATION ##############################################

wikiindex = "/index.php?title="

def update(pagename=None):

    if not os.path.exists("revisions.txt"): # case 1)
        if not os.path.exists("wikifiles.txt"):
            print "No wikifiles.txt found. Aborting"
            sys.exit()
        pages = []
        f = open("wikifiles.txt","r")
        if VERBOSE: print "Reading existing list..."
        for l in f.readlines():
            if l.strip() != "":
                if not "/wiki/" in l:
                    if VERBOSE: print "Adding ",l.strip()
                    pages.append(l.strip())
        f.close()
        if VERBOSE: print "Added ",str(len(pages))," entries"
        i = 1
        revs = []
        for page in pages:
            rev = getRevision(page)
            if VERBOSE: print str(i)," revision: ",rev
            revs.append(page+":"+rev)
            i += 1
        writeList(revs,"revisions.txt")
        print "All done. Successfully written revisions.txt with ",len(revs)," entries."
    elif os.path.exists("revisions.txt") and (not os.path.exists("updates.txt")): # case 2)
        f = open("revisions.txt","r")
        if VERBOSE: print "Reading revisions list..."
        revisions = {}
        for l in f.readlines():
            if l.strip() != "":
                r = l.strip().split(":")
                p = ":".join(r[:-1])
                if VERBOSE: print "Adding ",p
                revisions[p] = r[-1] # the revision ID is the last ":"-field
        f.close()
        if VERBOSE: print "Added ",str(len(revisions.keys()))," entries"
        updates = []
        i = 1
        for page in revisions.keys():
            rev = getRevision(page)
            if rev != revisions[page]:
                if VERBOSE: print str(i),page," has a new revision: ",rev
                updates.append(page)
            else:
                if VERBOSE: print str(i),page," is up to date "
            i += 1
        if updates:
            writeList(updates,"updates.txt")
            print "All done. Successfully written updates.txt with ",len(updates)," entries."
        else:
            print "Everything up to date. Nothing to be done."
elif os.path.exists("revisions.txt") and os.path.exists("updates.txt"): # case 3)
|
||||
if not os.path.exists("wikifiles.txt"):
|
||||
print "No wikifiles.txt found. Aborting"
|
||||
sys.exit()
|
||||
wikifiles = []
|
||||
f = open("wikifiles.txt","r")
|
||||
if VERBOSE: print "Reading wikifiles list..."
|
||||
for l in f.readlines():
|
||||
if l.strip() != "":
|
||||
wikifiles.append(l.strip())
|
||||
f.close()
|
||||
if VERBOSE: print "Read ",str(len(wikifiles))," entries"
|
||||
f = open("revisions.txt","r")
|
||||
if VERBOSE: print "Reading revisions list..."
|
||||
revisions = {}
|
||||
for l in f.readlines():
|
||||
if l.strip() != "":
|
||||
r = l.strip().split(":")
|
||||
p = ":".join(r[:-1])
|
||||
revisions[p] = r[1]
|
||||
f.close()
|
||||
todo = []
|
||||
f = open("updates.txt","r")
|
||||
if VERBOSE: print "Reading updates list..."
|
||||
for l in f.readlines():
|
||||
if l.strip() != "":
|
||||
todo.append(l.strip())
|
||||
f.close()
|
||||
if VERBOSE: print str(len(todo))," pages to scan..."
|
||||
import buildwikiindex
|
||||
buildwikiindex.WRITETHROUGH = False
|
||||
buildwikiindex.VERBOSE = VERBOSE
|
||||
updates = []
|
||||
for t in todo:
|
||||
if VERBOSE: print "Scanning ",t
|
||||
updates.extend(buildwikiindex.crawl(t))
|
||||
updates = [u for u in updates if not u in wikifiles]
|
||||
if VERBOSE: print str(len(updates))," files to download..."
|
||||
import downloadwiki
|
||||
i = 1
|
||||
for u in updates:
|
||||
if VERBOSE: print i, ": Fetching ", u
|
||||
downloadwiki.get(u)
|
||||
if not "/wiki/" in u:
|
||||
rev = getRevision(u)
|
||||
revisions[u] = rev
|
||||
if not u in wikifiles:
|
||||
wikifiles.append(u)
|
||||
i += 1
|
||||
if VERBOSE: print "Updating wikifiles and revisions..."
|
||||
writeList(wikifiles,"wikifiles.txt")
|
||||
updatedrevs = []
|
||||
for k in revisions.keys():
|
||||
updatedrevs.append(k+":"+revisions[k])
|
||||
writeList(updatedrevs,"revisions.txt")
|
||||
os.remove("updates.txt")
|
||||
if VERBOSE: print "All done!"
|
||||
|
||||
def getRevision(page):
    html = fetchPage(page)
    revs = re.findall("wgCurRevisionId\"\:(.*?),",html)
    if len(revs) == 1:
        return revs[0]
    print 'Error: unable to get revision ID of ' + page
    sys.exit()

def fetchPage(page):
    "retrieves given page from the wiki"
    print "fetching: ",page
    failcount = 0
    while failcount < MAXFAIL:
        try:
            html = (urlopen(URL + wikiindex + page).read())
            return html
        except HTTPError:
            failcount += 1
    print 'Error: unable to fetch page ' + page
    sys.exit()

def writeList(pages,filename):
    f = open(filename,"wb")
    for p in pages:
        f.write(p+"\n")
    f.close()
    if VERBOSE: print "written ",filename

if __name__ == "__main__":
    update(sys.argv[1:])
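
For completeness, the three cases of update.py chain together as follows when driven from Python rather than from the command line. This is a sketch under the assumption that it runs from src/Tools/offlinedoc with a complete wikifiles.txt already present; it simply calls the update() function defined above once per phase:

    # Hypothetical driver; each call lands in one of the three cases
    # documented in update.py's module docstring.
    import os
    import update

    update.update()              # case 1): builds revisions.txt from wikifiles.txt
    update.update()              # case 2): writes updates.txt if any revision changed
    if os.path.exists("updates.txt"):
        update.update()          # case 3): re-downloads changed pages, then cleans up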