""" Author: PH01L Email: phoil@osrsbox.com Website: https://www.osrsbox.com Description: Script to fetch OSRS Wiki pages for Category:Monsters. Copyright (c) 2021, PH01L ############################################################################### This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . ############################################################################### """ import asyncio from aiohttp_retry import RetryClient, JitterRetry from tqdm.asyncio import tqdm import os from pprint import pprint import sys import json import itertools import collections from pathlib import Path from datetime import datetime from datetime import timedelta from typing import List import aiohttp import config from scripts.wiki.wiki_page_titles import WikiPageTitles from scripts.wiki.wiki_page_text import WikiPageText from scripts.wiki.wikitext_parser import WikitextIDParser OSRS_WIKI_API_URL = "https://oldschool.runescape.wiki/api.php" TITLES_FP = Path(config.DATA_MONSTERS_PATH / "monsters-wiki-page-titles.json") TEXT_FP = Path(config.DATA_MONSTERS_PATH / "monsters-wiki-page-text.json") def fetch(): """Get all the wiki category page titles and page text.""" # Try to determine the last update if TITLES_FP.exists(): stream = os.popen(f"git log -1 --format='%ad' {TITLES_FP}") last_extraction_date = stream.read() last_extraction_date = last_extraction_date.strip() last_extraction_date = last_extraction_date.replace(" -0500", "") try: last_extraction_date = datetime.strptime(last_extraction_date, "%a %b %d %H:%M:%S %Y") last_extraction_date = last_extraction_date - timedelta(days=3) except: last_extraction_date = datetime.strptime("2013-02-22", "%Y-%m-%d") else: last_extraction_date = datetime.strptime("2013-02-22", "%Y-%m-%d") print(">>> Starting wiki page titles extraction...") # Create object to handle page titles extraction wiki_page_titles = WikiPageTitles(OSRS_WIKI_API_URL, ["Monsters"]) # Boolean to trigger load page titles from file, or run fresh page title extraction load_files = False # Load previously extracted page titles from JSON, or extract from OSRS Wiki API if load_files: loaded_page_titles = wiki_page_titles.load_page_titles(TITLES_FP) if not loaded_page_titles: sys.exit(">>> ERROR: Specified page titles to load, but not file found. Exiting.") else: # Extract page titles using supplied categories wiki_page_titles.extract_page_titles() # Extract page revision date # Loop 50 page titles at a time, the max number for a revisions request using page titles for page_title_list in itertools.zip_longest(*[iter(wiki_page_titles.page_titles)] * 50): # Remove None entries from the list of page titles page_title_list = filter(None, page_title_list) # Join the page titles list using the pipe (|) separator page_titles_string = "|".join(page_title_list) # Extract the page revision date wiki_page_titles.extract_last_revision_timestamp(page_titles_string) # Save all page titles and wiki_page_titles.export_page_titles_in_json(TITLES_FP) # Determine page titles count page_titles_total = len(wiki_page_titles) print(f">>> Number of extracted wiki pages: {page_titles_total}") # Open page title JSON file, to check if page needs to have wiki text extracted _process_monster_properties(wiki_page_titles) def _process_monster_properties(a): loop = asyncio.get_event_loop() loop.run_until_complete(_inner_process_monster_properties(a)) async def _inner_process_monster_properties(wiki_page_titles): pprint(">>> Starting wiki text extraction for extracted page titles...") json_data = dict() conn = aiohttp.TCPConnector(limit = 20) timeout = aiohttp.ClientTimeout( total=5 * 60, connect=60, sock_connect=5, sock_read=5 ) retry_options = JitterRetry( attempts=5, exceptions={aiohttp.client_exceptions.ServerTimeoutError}, start_timeout=1.5, ) retry_client = RetryClient( retry_options=retry_options, connector=conn, timeout=timeout, raise_for_status=False, ) async with retry_client as session: tasks = [] for page_title, _ in wiki_page_titles.page_titles.items(): # If script fails: # 1) Set load_files (above) to True # 2) Uncomment code below, and set item ID to last failed item # 3) Use this script: python monster_properties_fetch -c Monsters # if int(page_titles_count) < 5400: # page_titles_count += 1 # continue # Check if page title is already present in JSON output file, also check revision date if page_title in json_data: # If the last revision was before last extract, skip continue # Create object to extract page wiki text wiki_page_text = WikiPageText(OSRS_WIKI_API_URL, page_title) # If the page title has not been extracted, extract wiki text and save to JSON file tasks.append(asyncio.ensure_future(wiki_page_text.extract_page_wiki_text(session))) d: List[WikiPageText] = await tqdm.gather(*tasks) for item in d: item.export_wiki_text_to_json(TEXT_FP) # tasks = [] # for item in d: # tasks.append(asyncio.ensure_future(item.async_export_wiki_text_to_json(TEXT_FP))) # await asyncio.gather(*tasks) def process(): print(">>> Starting wiki page text processing...") # Call WikitextIDParser to map: # 1. ID to infobox template version # 2. ID to wikitext entry template_names = ["infobox monster"] wiki_data_ids = WikitextIDParser(TEXT_FP, template_names) wiki_data_ids.process_osrswiki_data_dump() WikiEntry = collections.namedtuple('WikiEntry', 'wiki_page_name version_number wikitext') export = dict() for item_id, wikitext in wiki_data_ids.item_id_to_wikitext.items(): entry = WikiEntry(wiki_page_name=wiki_data_ids.item_id_to_wiki_name[item_id], version_number=wiki_data_ids.item_id_to_version_number[item_id], wikitext=wikitext) export[item_id] = entry out_fi = Path(config.DATA_MONSTERS_PATH / "monsters-wiki-page-text-processed.json") with open(out_fi, 'w') as f: json.dump(export, f, indent=4) if __name__ == "__main__": fetch() process()