import wikipedia
import json
import os

# wiki_html = wikipediaapi.Wikipedia(language='en', extract_format=wikipediaapi.ExtractFormat.HTML)
wikipedia.set_lang("en")

CACHE_PATH = 'cached_pages/'


# --- cache system ---
def json_parsing(NAME_PATH):
    # read a cached page back from its JSON file
    with open(NAME_PATH, 'r') as file:
        jsn = json.load(file)
    return jsn


def json_writing(NAME_PATH, data):
    # serialize the page to JSON; objects that are not natively
    # serializable fall back to their __dict__
    with open(NAME_PATH, 'w') as file:
        json.dump(data, file, indent=4, default=lambda o: o.__dict__)


# --- wiki request ---
def get_wikipage(pagename, lang="en"):
    # get the Wikipedia page by name, resolving the title through a search first
    try:
        results = wikipedia.search(pagename, results=1, suggestion=False)
        try:
            pagename = results[0]
        except IndexError:
            # no suggestion or search result: the page doesn't exist
            raise wikipedia.PageError(pagename)
        return wikipedia.WikipediaPage(pagename, redirect=True, preload=True)
    except wikipedia.exceptions.DisambiguationError as e:
        # ambiguous title: print the candidates and return an empty result
        print(e.options)
        return ''


def request_page(request):
    page = get_wikipage(request)

    # strip <link> and <style> tags from the HTML content we get back
    p_html = page.html().split("<")
    p_clean = []
    for t in p_html:
        if t.startswith("link") or t.startswith("style") or t == "":
            pass
        else:
            p_clean.append("<" + t)

    # store the original request and the cleaned content as custom attributes
    setattr(page, 'request', request)
    setattr(page, 'cleaned_content', ''.join(p_clean))

    # add it to the cache
    json_writing(CACHE_PATH + request + '.json', page)
    return page


def get_page(request):
    # make sure the cache directory exists before reading or writing
    os.makedirs(CACHE_PATH, exist_ok=True)

    # serve the page from the cache if it is already there,
    # otherwise request it through the module + API
    if os.path.isfile(CACHE_PATH + request + '.json'):
        print("--- FROM CACHE: " + request)
        page = json_parsing(CACHE_PATH + request + '.json')
    else:
        print("--- REQUEST: " + request)
        page = request_page(request)
    return page
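

# --- usage sketch ---
# A minimal way to exercise get_page(); "Banana" is just an arbitrary page title.
# The first run prints "--- REQUEST: Banana" and writes cached_pages/Banana.json;
# later runs print "--- FROM CACHE: Banana" and return the parsed JSON dict
# rather than a live WikipediaPage, so attribute access differs between the two cases.
if __name__ == "__main__":
    page = get_page("Banana")
    # fresh pages are WikipediaPage objects, cached ones are plain dicts
    # (assuming the dumped __dict__ kept the 'title' attribute)
    title = page['title'] if isinstance(page, dict) else page.title
    print(title)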