import json
import os

import wikipedia

# Alternative: the wikipedia-api package can return HTML extracts directly:
# wiki_html = wikipediaapi.Wikipedia(language='en', extract_format=wikipediaapi.ExtractFormat.HTML)

wikipedia.set_lang("en")

CACHE_PATH = 'cached_pages/'
# --- cache system ---
def json_parsing(name_path):
    # load a previously cached page from disk
    with open(name_path, 'r') as file:
        return json.load(file)

def json_writing(name_path, data):
    # serialize the page to disk; objects json can't handle natively
    # (such as a WikipediaPage) fall back to their __dict__
    with open(name_path, 'w') as file:
        json.dump(data, file, indent=4, default=lambda o: o.__dict__)
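# Round-trip sketch (illustrative, not part of the original file): because
# json_writing() serializes through __dict__, json_parsing() hands back a
# plain dict rather than the original object, e.g.
#
#   json_writing('demo.json', some_object)   # attributes -> JSON keys
#   json_parsing('demo.json')                # -> {'attr': value, ...}
#
# so cached pages are read back as dicts, not WikipediaPage instances.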
# --- wiki request ---
def get_wikipage(pagename, lang="en"):
    # resolve a free-form query to a Wikipedia page via the search API
    try:
        results = wikipedia.search(pagename, results=1, suggestion=False)
        try:
            pagename = results[0]
        except IndexError:
            # no search results: the page doesn't exist
            raise wikipedia.PageError(pagename)
        return wikipedia.WikipediaPage(pagename, redirect=True, preload=True)
    except wikipedia.exceptions.DisambiguationError as e:
        # ambiguous query: report the candidate titles and give up
        print(e.options)
        return None
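# Disambiguation sketch (illustrative query, not from the original file):
#
#   get_wikipage("Mercury")   # prints e.g. ['Mercury (planet)', ...] -> None
#
# callers therefore need to check for None before touching the result.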
def request_page(request):
    # fetch the page and strip <link> tags and inline <style> blocks
    # from its HTML content
    page = get_wikipage(request)
    if page is None:
        # ambiguous request: nothing to clean or cache
        return None
    p_html = page.html().split("<")
    p_clean = []
    for t in p_html:
        # each fragment after the split starts with a tag name; drop the
        # <link ...> and <style ...> fragments (the latter carries the CSS
        # text up to its closing tag) as well as the empty leading fragment
        if t.startswith("link") or t.startswith("style") or t == "":
            continue
        p_clean.append("<" + t)
    # attach the original request and the cleaned HTML as custom
    # attributes, then add the page to the cache
    page.request = request
    page.cleaned_content = ''.join(p_clean)
    os.makedirs(CACHE_PATH, exist_ok=True)  # make sure the cache dir exists
    json_writing(CACHE_PATH + request + '.json', page)
    return page
def get_page(request):
    cache_file = CACHE_PATH + request + '.json'
    # serve the page from the cache when possible ...
    if os.path.isfile(cache_file):
        print("--- FROM CACHE: " + request)
        page = json_parsing(cache_file)
    # ... otherwise request it through the module + API
    else:
        print("--- REQUEST: " + request)
        page = request_page(request)
    return page
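# Usage sketch (assumption: run as a script; the query below is only an
# example, not from the original file). Note that a fresh request yields a
# WikipediaPage object while a cache hit yields a plain dict, so both
# shapes are handled here.
if __name__ == "__main__":
    result = get_page("Python (programming language)")
    if result is None:
        print("No page (ambiguous or missing request).")
    elif isinstance(result, dict):
        print(result['title'])   # cached copy: plain dict
    else:
        print(result.title)      # fresh copy: WikipediaPage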