Updated wiki download scripts
This commit is contained in:
3
.gitignore
vendored
3
.gitignore
vendored
@@ -28,8 +28,7 @@ install_manifest.txt
|
|||||||
/ZERO_CHECK.dir/
|
/ZERO_CHECK.dir/
|
||||||
/build/
|
/build/
|
||||||
/src/Tools/offlinedoc/localwiki/
|
/src/Tools/offlinedoc/localwiki/
|
||||||
/src/Tools/offlinedoc/todolist.txt
|
/src/Tools/offlinedoc/*.txt
|
||||||
/src/Tools/offlinedoc/wikifiles.txt
|
|
||||||
OpenSCAD_rc.py
|
OpenSCAD_rc.py
|
||||||
.subuser-dev
|
.subuser-dev
|
||||||
/\.idea/
|
/\.idea/
|
||||||
|
|||||||
@@ -16,4 +16,13 @@ download and another to actually download the files.
|
|||||||
|
|
||||||
4) run "buildpdf.py" to generate freecad.pdf (wkhtmltopdf must be installed)
|
4) run "buildpdf.py" to generate freecad.pdf (wkhtmltopdf must be installed)
|
||||||
|
|
||||||
5) the qhelp files can be tested with "assistant -collectionFile freecad.qhc"
|
5) the qhelp files can be tested with "assistant -collectionFile freecad.qhc"
|
||||||
|
|
||||||
|
6) If you have already downloaded the whole wiki, run "update.py" immediately
|
||||||
|
after, to create a list of revision IDs for each page.
|
||||||
|
|
||||||
|
7) Once the initial revisions list has been created, the "update.py" script
|
||||||
|
can be run anytime in the future, to check for pages that have changed
|
||||||
|
since the stored revision ID. The script is meant to be run twice: once to get
|
||||||
|
a list of pages that have changed, and once more to download the changed
|
||||||
|
pages (and all their dependencies) again.
|
||||||
|
|||||||
@@ -36,9 +36,9 @@ from urllib2 import urlopen, HTTPError
|
|||||||
|
|
||||||
# CONFIGURATION #################################################
|
# CONFIGURATION #################################################
|
||||||
|
|
||||||
URL = "http://www.freecadweb.org/wiki" #default URL if no URL is passed
|
URL = "https://www.freecadweb.org/wiki" #default URL if no URL is passed
|
||||||
INDEX = "Online_Help_Toc" # the start page from where to crawl the wiki
|
INDEX = "Online_Help_Toc" # the start page from where to crawl the wiki
|
||||||
NORETRIEVE = ['Manual','Developer_hub','Power_users_hub','Users_hub','Source_documentation', 'User_hub','Main_Page','About_this_site','Interesting_links','Syndication_feeds','FreeCAD:General_disclaimer','FreeCAD:About','FreeCAD:Privacy_policy','Introduction_to_python'] # pages that won't be fetched (kept online)
|
NORETRIEVE = ['Manual','Developer_hub','Power_users_hub','Users_hub','Source_documentation', 'User_hub','Main_Page','About_this_site','Interesting_links','Syndication_feeds','FreeCAD:General_disclaimer','FreeCAD:About','FreeCAD:Privacy_policy','WikiPages'] # pages that won't be fetched (kept online)
|
||||||
GETTRANSLATIONS = False # Set true if you want to get the translations too.
|
GETTRANSLATIONS = False # Set true if you want to get the translations too.
|
||||||
MAXFAIL = 3 # max number of retries if download fails
|
MAXFAIL = 3 # max number of retries if download fails
|
||||||
VERBOSE = True # to display what's going on. Otherwise, runs totally silent.
|
VERBOSE = True # to display what's going on. Otherwise, runs totally silent.
|
||||||
@@ -48,35 +48,37 @@ WRITETHROUGH = True # if true, fetched files are constantly written to disk, in
|
|||||||
|
|
||||||
wikiindex = "/index.php?title="
|
wikiindex = "/index.php?title="
|
||||||
|
|
||||||
def crawl(pagename):
|
def crawl(pagename=[]):
|
||||||
"downloads an entire wiki site"
|
"downloads an entire wiki site"
|
||||||
todolist = []
|
todolist = []
|
||||||
processed = []
|
processed = []
|
||||||
count = 1
|
count = 1
|
||||||
if os.path.exists("wikifiles.txt"):
|
if pagename:
|
||||||
f = open("wikifiles.txt","r")
|
if not isinstance(pagename,list):
|
||||||
if VERBOSE: print "Reading existing list..."
|
pagename = [pagename]
|
||||||
for l in f.readlines():
|
todolist = pagename
|
||||||
if l.strip() != "":
|
|
||||||
if VERBOSE: print "Adding ",l
|
|
||||||
processed.append(l.strip())
|
|
||||||
f.close()
|
|
||||||
if os.path.exists("todolist.txt"):
|
|
||||||
f = open("todolist.txt","r")
|
|
||||||
if VERBOSE: print "Reading existing todo list..."
|
|
||||||
for l in f.readlines():
|
|
||||||
if l.strip() != "":
|
|
||||||
todolist.append(l.strip())
|
|
||||||
f.close()
|
|
||||||
else:
|
else:
|
||||||
if pagename:
|
if os.path.exists("wikifiles.txt"):
|
||||||
todolist = pagename
|
f = open("wikifiles.txt","r")
|
||||||
|
if VERBOSE: print "Reading existing list..."
|
||||||
|
for l in f.readlines():
|
||||||
|
if l.strip() != "":
|
||||||
|
if VERBOSE: print "Adding ",l
|
||||||
|
processed.append(l.strip())
|
||||||
|
f.close()
|
||||||
|
if os.path.exists("todolist.txt"):
|
||||||
|
f = open("todolist.txt","r")
|
||||||
|
if VERBOSE: print "Reading existing todo list..."
|
||||||
|
for l in f.readlines():
|
||||||
|
if l.strip() != "":
|
||||||
|
todolist.append(l.strip())
|
||||||
|
f.close()
|
||||||
else:
|
else:
|
||||||
indexpages,imgs = get(INDEX)
|
indexpages,imgs = get(INDEX)
|
||||||
todolist.extend(indexpages)
|
todolist.extend(indexpages)
|
||||||
while todolist:
|
while todolist:
|
||||||
targetpage = todolist.pop()
|
targetpage = todolist.pop()
|
||||||
if not targetpage in NORETRIEVE:
|
if (not targetpage in NORETRIEVE):
|
||||||
if VERBOSE: print count, ": Scanning ", targetpage
|
if VERBOSE: print count, ": Scanning ", targetpage
|
||||||
pages,images = get(targetpage)
|
pages,images = get(targetpage)
|
||||||
count += 1
|
count += 1
|
||||||
@@ -92,6 +94,8 @@ def crawl(pagename):
|
|||||||
if VERBOSE: print "Fetched ", count, " pages"
|
if VERBOSE: print "Fetched ", count, " pages"
|
||||||
if not WRITETHROUGH:
|
if not WRITETHROUGH:
|
||||||
writeList(processed)
|
writeList(processed)
|
||||||
|
if pagename:
|
||||||
|
return processed
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
def get(page):
|
def get(page):
|
||||||
@@ -126,11 +130,16 @@ def cleanhtml(html):
|
|||||||
|
|
||||||
def getlinks(html):
|
def getlinks(html):
|
||||||
"returns a list of wikipage links in html file"
|
"returns a list of wikipage links in html file"
|
||||||
|
global NORETRIEVE
|
||||||
links = re.findall('<a[^>]*>.*?</a>',html)
|
links = re.findall('<a[^>]*>.*?</a>',html)
|
||||||
pages = []
|
pages = []
|
||||||
for l in links:
|
for l in links:
|
||||||
# rg = re.findall('php\?title=(.*)\" title',l)
|
# rg = re.findall('php\?title=(.*)\" title',l)
|
||||||
rg = re.findall('href=.*?php\?title=(.*?)"',l)
|
rg = re.findall('href=.*?php\?title=(.*?)"',l)
|
||||||
|
if not rg:
|
||||||
|
rg = re.findall('href="\/wiki\/(.*?)"',l)
|
||||||
|
if "images" in rg:
|
||||||
|
rg = None
|
||||||
if rg:
|
if rg:
|
||||||
rg = rg[0]
|
rg = rg[0]
|
||||||
if not "Command_Reference" in rg:
|
if not "Command_Reference" in rg:
|
||||||
@@ -138,21 +147,23 @@ def getlinks(html):
|
|||||||
rg = rg.split('#')[0]
|
rg = rg.split('#')[0]
|
||||||
if ":" in rg:
|
if ":" in rg:
|
||||||
NORETRIEVE.append(rg)
|
NORETRIEVE.append(rg)
|
||||||
if ";" in rg:
|
|
||||||
NORETRIEVE.append(rg)
|
|
||||||
if "&" in rg:
|
if "&" in rg:
|
||||||
NORETRIEVE.append(rg)
|
NORETRIEVE.append(rg)
|
||||||
|
if ";" in rg:
|
||||||
|
NORETRIEVE.append(rg)
|
||||||
if "/" in rg:
|
if "/" in rg:
|
||||||
if not GETTRANSLATIONS:
|
if not GETTRANSLATIONS:
|
||||||
NORETRIEVE.append(rg)
|
NORETRIEVE.append(rg)
|
||||||
pages.append(rg)
|
|
||||||
if not rg in NORETRIEVE:
|
if not rg in NORETRIEVE:
|
||||||
|
pages.append(rg)
|
||||||
print "got link: ",rg
|
print "got link: ",rg
|
||||||
return pages
|
return pages
|
||||||
|
|
||||||
def getimagelinks(html):
|
def getimagelinks(html):
|
||||||
"returns a list of image links found in an html file"
|
"returns a list of image links found in an html file"
|
||||||
return re.findall('<img.*?src="(.*?)"',html)
|
imlinks = re.findall('<img.*?src="(.*?)"',html)
|
||||||
|
imlinks = [l for l in imlinks if not l.startswith("http")] # remove external images
|
||||||
|
return imlinks
|
||||||
|
|
||||||
def fetchpage(page):
|
def fetchpage(page):
|
||||||
"retrieves given page from the wiki"
|
"retrieves given page from the wiki"
|
||||||
@@ -165,6 +176,7 @@ def fetchpage(page):
|
|||||||
except HTTPError:
|
except HTTPError:
|
||||||
failcount += 1
|
failcount += 1
|
||||||
print 'Error: unable to fetch page ' + page
|
print 'Error: unable to fetch page ' + page
|
||||||
|
sys.exit()
|
||||||
|
|
||||||
def cleanList(pagelist):
|
def cleanList(pagelist):
|
||||||
"cleans the list"
|
"cleans the list"
|
||||||
|
|||||||
@@ -35,7 +35,7 @@ from urllib2 import urlopen, HTTPError
|
|||||||
|
|
||||||
# CONFIGURATION #################################################
|
# CONFIGURATION #################################################
|
||||||
|
|
||||||
DEFAULTURL = "http://www.freecadweb.org/wiki" #default URL if no URL is passed
|
DEFAULTURL = "https://www.freecadweb.org/wiki" #default URL if no URL is passed
|
||||||
INDEX = "Online_Help_Toc" # the start page from where to crawl the wiki
|
INDEX = "Online_Help_Toc" # the start page from where to crawl the wiki
|
||||||
NORETRIEVE = ['Manual','Developer_hub','Power_users_hub','Users_hub','Source_documentation', 'User_hub','Main_Page','About_this_site','FreeCAD:General_disclaimer','FreeCAD:About','FreeCAD:Privacy_policy','Introduction_to_python'] # pages that won't be fetched (kept online)
|
NORETRIEVE = ['Manual','Developer_hub','Power_users_hub','Users_hub','Source_documentation', 'User_hub','Main_Page','About_this_site','FreeCAD:General_disclaimer','FreeCAD:About','FreeCAD:Privacy_policy','Introduction_to_python'] # pages that won't be fetched (kept online)
|
||||||
GETTRANSLATIONS = False # Set true if you want to get the translations too.
|
GETTRANSLATIONS = False # Set true if you want to get the translations too.
|
||||||
@@ -189,6 +189,8 @@ def getlinks(html):
|
|||||||
for l in links:
|
for l in links:
|
||||||
# rg = re.findall('php\?title=(.*)\" title',l)
|
# rg = re.findall('php\?title=(.*)\" title',l)
|
||||||
rg = re.findall('href=.*?php\?title=(.*?)"',l)
|
rg = re.findall('href=.*?php\?title=(.*?)"',l)
|
||||||
|
if not rg:
|
||||||
|
rg = re.findall('href="\/wiki\/(.*?)"',l)
|
||||||
if rg:
|
if rg:
|
||||||
rg = rg[0]
|
rg = rg[0]
|
||||||
if not "Command_Reference" in rg:
|
if not "Command_Reference" in rg:
|
||||||
|
|||||||
198
src/Tools/offlinedoc/update.py
Executable file
198
src/Tools/offlinedoc/update.py
Executable file
@@ -0,0 +1,198 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
#***************************************************************************
|
||||||
|
#* *
|
||||||
|
#* Copyright (c) 2017 Yorik van Havre <yorik@uncreated.net> *
|
||||||
|
#* *
|
||||||
|
#* This program is free software; you can redistribute it and/or modify *
|
||||||
|
#* it under the terms of the GNU Lesser General Public License (LGPL) *
|
||||||
|
#* as published by the Free Software Foundation; either version 2 of *
|
||||||
|
#* the License, or (at your option) any later version. *
|
||||||
|
#* for detail see the LICENCE text file. *
|
||||||
|
#* *
|
||||||
|
#* This program is distributed in the hope that it will be useful, *
|
||||||
|
#* but WITHOUT ANY WARRANTY; without even the implied warranty of *
|
||||||
|
#* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
|
||||||
|
#* GNU Library General Public License for more details. *
|
||||||
|
#* *
|
||||||
|
#* You should have received a copy of the GNU Library General Public *
|
||||||
|
#* License along with this program; if not, write to the Free Software *
|
||||||
|
#* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 *
|
||||||
|
#* USA *
|
||||||
|
#* *
|
||||||
|
#***************************************************************************
|
||||||
|
|
||||||
|
__title__="update.py"
|
||||||
|
__author__ = "Yorik van Havre <yorik@uncreated.net>"
|
||||||
|
__url__ = "http://www.freecadweb.org"
|
||||||
|
|
||||||
|
"""
|
||||||
|
This script needs to be run after the wiki has been fully downloaded. It has three usages:
|
||||||
|
|
||||||
|
1) If no revisions.txt file is found, it parses the contents of the wikifiles.txt file
|
||||||
|
and, for each entry, it retrieves a corresponding revision ID, and creates a revisions.txt file
|
||||||
|
|
||||||
|
2) If a revisions.txt file exists but no updates.txt file exists, it crawls through all entries of
|
||||||
|
wikifiles.txt, and for each one, compares the current revision with the one stored in revisions.txt.
|
||||||
|
An updates.txt file is created with all pages that have different revision IDs.
|
||||||
|
|
||||||
|
3) If updates.txt exists, each entry of it will be scanned again for new links and all the needed
|
||||||
|
files downloaded. revisions.txt and wikifiles.txt are also updated.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sys, os, re, tempfile, getopt
|
||||||
|
from urllib2 import urlopen, HTTPError
|
||||||
|
|
||||||
|
# CONFIGURATION #################################################
|
||||||
|
|
||||||
|
URL = "https://www.freecadweb.org/wiki" #default URL if no URL is passed
|
||||||
|
GETTRANSLATIONS = False # Set true if you want to get the translations too.
|
||||||
|
MAXFAIL = 3 # max number of retries if download fails
|
||||||
|
VERBOSE = True # to display what's going on. Otherwise, runs totally silent.
|
||||||
|
|
||||||
|
# END CONFIGURATION ##############################################
|
||||||
|
|
||||||
|
wikiindex = "/index.php?title="
|
||||||
|
|
||||||
|
def update(pagename=None):
|
||||||
|
|
||||||
|
if not os.path.exists("revisions.txt"): # case 1)
|
||||||
|
if not os.path.exists("wikifiles.txt"):
|
||||||
|
print "No wikifiles.txt found. Aborting"
|
||||||
|
sys.exit()
|
||||||
|
pages = []
|
||||||
|
f = open("wikifiles.txt","r")
|
||||||
|
if VERBOSE: print "Reading existing list..."
|
||||||
|
for l in f.readlines():
|
||||||
|
if l.strip() != "":
|
||||||
|
if not "/wiki/" in l:
|
||||||
|
if VERBOSE: print "Adding ",l.strip()
|
||||||
|
pages.append(l.strip())
|
||||||
|
f.close()
|
||||||
|
if VERBOSE: print "Added ",str(len(pages))," entries"
|
||||||
|
i = 1
|
||||||
|
revs = []
|
||||||
|
for page in pages:
|
||||||
|
rev = getRevision(page)
|
||||||
|
if VERBOSE: print str(i)," revision: ",rev
|
||||||
|
revs.append(page+":"+rev)
|
||||||
|
i += 1
|
||||||
|
writeList(revs,"revisions.txt")
|
||||||
|
print "All done. Successfully written revisions.txt with ",len(revs)," entries."
|
||||||
|
|
||||||
|
elif os.path.exists("revisions.txt") and (not os.path.exists("updates.txt")): # case 2)
|
||||||
|
f = open("revisions.txt","r")
|
||||||
|
if VERBOSE: print "Reading revisions list..."
|
||||||
|
revisions = {}
|
||||||
|
for l in f.readlines():
|
||||||
|
if l.strip() != "":
|
||||||
|
r = l.strip().split(":")
|
||||||
|
p = ":".join(r[:-1])
|
||||||
|
if VERBOSE: print "Adding ",p
|
||||||
|
revisions[p] = r[1]
|
||||||
|
f.close()
|
||||||
|
if VERBOSE: print "Added ",str(len(revisions.keys()))," entries"
|
||||||
|
updates = []
|
||||||
|
i = 1
|
||||||
|
for page in revisions.keys():
|
||||||
|
rev = getRevision(page)
|
||||||
|
if rev != revisions[page]:
|
||||||
|
if VERBOSE: print str(i),page," has a new revision: ",rev
|
||||||
|
updates.append(page)
|
||||||
|
else:
|
||||||
|
if VERBOSE: print str(i),page," is up to date "
|
||||||
|
i += 1
|
||||||
|
if updates:
|
||||||
|
writeList(updates,"updates.txt")
|
||||||
|
print "All done. Successfully written updates.txt with ",len(updates)," entries."
|
||||||
|
else:
|
||||||
|
print "Everything up to date. Nothing to be done."
|
||||||
|
|
||||||
|
elif os.path.exists("revisions.txt") and os.path.exists("updates.txt"): # case 3)
|
||||||
|
if not os.path.exists("wikifiles.txt"):
|
||||||
|
print "No wikifiles.txt found. Aborting"
|
||||||
|
sys.exit()
|
||||||
|
wikifiles = []
|
||||||
|
f = open("wikifiles.txt","r")
|
||||||
|
if VERBOSE: print "Reading wikifiles list..."
|
||||||
|
for l in f.readlines():
|
||||||
|
if l.strip() != "":
|
||||||
|
wikifiles.append(l.strip())
|
||||||
|
f.close()
|
||||||
|
if VERBOSE: print "Read ",str(len(wikifiles))," entries"
|
||||||
|
f = open("revisions.txt","r")
|
||||||
|
if VERBOSE: print "Reading revisions list..."
|
||||||
|
revisions = {}
|
||||||
|
for l in f.readlines():
|
||||||
|
if l.strip() != "":
|
||||||
|
r = l.strip().split(":")
|
||||||
|
p = ":".join(r[:-1])
|
||||||
|
revisions[p] = r[1]
|
||||||
|
f.close()
|
||||||
|
todo = []
|
||||||
|
f = open("updates.txt","r")
|
||||||
|
if VERBOSE: print "Reading updates list..."
|
||||||
|
for l in f.readlines():
|
||||||
|
if l.strip() != "":
|
||||||
|
todo.append(l.strip())
|
||||||
|
f.close()
|
||||||
|
if VERBOSE: print str(len(todo))," pages to scan..."
|
||||||
|
import buildwikiindex
|
||||||
|
buildwikiindex.WRITETHROUGH = False
|
||||||
|
buildwikiindex.VERBOSE = VERBOSE
|
||||||
|
updates = []
|
||||||
|
for t in todo:
|
||||||
|
if VERBOSE: print "Scanning ",t
|
||||||
|
updates.extend(buildwikiindex.crawl(t))
|
||||||
|
updates = [u for u in updates if not u in wikifiles]
|
||||||
|
if VERBOSE: print str(len(updates))," files to download..."
|
||||||
|
import downloadwiki
|
||||||
|
i = 1
|
||||||
|
for u in updates:
|
||||||
|
if VERBOSE: print i, ": Fetching ", u
|
||||||
|
downloadwiki.get(u)
|
||||||
|
if not "/wiki/" in u:
|
||||||
|
rev = getRevision(u)
|
||||||
|
revisions[u] = rev
|
||||||
|
if not u in wikifiles:
|
||||||
|
wikifiles.append(u)
|
||||||
|
i += 1
|
||||||
|
if VERBOSE: print "Updating wikifiles and revisions..."
|
||||||
|
writeList(wikifiles,"wikifiles.txt")
|
||||||
|
updatedrevs = []
|
||||||
|
for k in revisions.keys():
|
||||||
|
updatedrevs.append(k+":"+revisions[k])
|
||||||
|
writeList(updatedrevs,"revisions.txt")
|
||||||
|
os.remove("updates.txt")
|
||||||
|
if VERBOSE: print "All done!"
|
||||||
|
|
||||||
|
def getRevision(page):
|
||||||
|
html = fetchPage(page)
|
||||||
|
revs = re.findall("wgCurRevisionId\"\:(.*?),",html)
|
||||||
|
if len(revs) == 1:
|
||||||
|
return revs[0]
|
||||||
|
print 'Error: unable to get revision ID of ' + page
|
||||||
|
sys.exit()
|
||||||
|
|
||||||
|
def fetchPage(page):
|
||||||
|
"retrieves given page from the wiki"
|
||||||
|
print "fetching: ",page
|
||||||
|
failcount = 0
|
||||||
|
while failcount < MAXFAIL:
|
||||||
|
try:
|
||||||
|
html = (urlopen(URL + wikiindex + page).read())
|
||||||
|
return html
|
||||||
|
except HTTPError:
|
||||||
|
failcount += 1
|
||||||
|
print 'Error: unable to fetch page ' + page
|
||||||
|
sys.exit()
|
||||||
|
|
||||||
|
def writeList(pages,filename):
|
||||||
|
f = open(filename,"wb")
|
||||||
|
for p in pages:
|
||||||
|
f.write(p+"\n")
|
||||||
|
f.close()
|
||||||
|
if VERBOSE: print "written ",filename
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # when ran as a script, hand the command-line arguments
    # (script name excluded) over to update()
    arguments = sys.argv[1:]
    update(arguments)
|
||||||
Reference in New Issue
Block a user