osrsbox-db/scripts/monsters/monsters_properties.py

"""
Author: PH01L
Email: phoil@osrsbox.com
Website: https://www.osrsbox.com
Description:
Script to fetch OSRS Wiki pages for Category:Monsters.
Copyright (c) 2021, PH01L
###############################################################################
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
###############################################################################
"""
import asyncio
import collections
import itertools
import json
import os
import sys
from datetime import datetime, timedelta
from pathlib import Path
from typing import List

import aiohttp
from aiohttp_retry import RetryClient, JitterRetry
from tqdm.asyncio import tqdm

import config
from scripts.wiki.wiki_page_titles import WikiPageTitles
from scripts.wiki.wiki_page_text import WikiPageText
from scripts.wiki.wikitext_parser import WikitextIDParser

OSRS_WIKI_API_URL = "https://oldschool.runescape.wiki/api.php"
TITLES_FP = Path(config.DATA_MONSTERS_PATH / "monsters-wiki-page-titles.json")
TEXT_FP = Path(config.DATA_MONSTERS_PATH / "monsters-wiki-page-text.json")


def fetch():
    """Get all the wiki category page titles and page text."""
    # Try to determine the last update from the git history of the page titles file
    if TITLES_FP.exists():
        stream = os.popen(f"git log -1 --format='%ad' {TITLES_FP}")
        last_extraction_date = stream.read()
        last_extraction_date = last_extraction_date.strip()
        # The default git date output looks like "Mon Feb 22 12:34:56 2021 -0500";
        # strip the (assumed) UTC-5 offset so strptime can parse the remainder
        last_extraction_date = last_extraction_date.replace(" -0500", "")
        try:
            last_extraction_date = datetime.strptime(last_extraction_date, "%a %b %d %H:%M:%S %Y")
            # Subtract three days to give an overlap window with the last extraction
            last_extraction_date = last_extraction_date - timedelta(days=3)
        except ValueError:
            # Unparseable date: fall back to the OSRS release date to force a full extraction
            last_extraction_date = datetime.strptime("2013-02-22", "%Y-%m-%d")
    else:
        last_extraction_date = datetime.strptime("2013-02-22", "%Y-%m-%d")

    print(">>> Starting wiki page titles extraction...")
    # Create object to handle page titles extraction
    wiki_page_titles = WikiPageTitles(OSRS_WIKI_API_URL,
                                      ["Monsters"])

    # Boolean to trigger loading page titles from file, or running a fresh page title extraction
    load_files = False

    # Load previously extracted page titles from JSON, or extract them from the OSRS Wiki API
    if load_files:
        loaded_page_titles = wiki_page_titles.load_page_titles(TITLES_FP)
        if not loaded_page_titles:
            sys.exit(">>> ERROR: Specified page titles to load, but no file was found. Exiting.")
    else:
        # Extract page titles using the supplied categories
        wiki_page_titles.extract_page_titles()

        # Extract page revision dates, 50 page titles at a time (the maximum number
        # of titles allowed in a single revisions request)
        for page_title_list in itertools.zip_longest(*[iter(wiki_page_titles.page_titles)] * 50):
            # Remove None padding entries from the list of page titles
            page_title_list = filter(None, page_title_list)
            # Join the page titles list using the pipe (|) separator
            page_titles_string = "|".join(page_title_list)
            # Extract the page revision date
            wiki_page_titles.extract_last_revision_timestamp(page_titles_string)

        # Save all page titles and revision timestamps to a JSON file
        wiki_page_titles.export_page_titles_in_json(TITLES_FP)

    # Determine page titles count
    page_titles_total = len(wiki_page_titles)
    print(f">>> Number of extracted wiki pages: {page_titles_total}")

    # Extract the wiki text for every collected page title
    _process_monster_properties(wiki_page_titles)
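

# _process_monster_properties() is a small synchronous wrapper: it drives the async
# wiki text extraction to completion on the current event loop. (On newer Python
# versions, asyncio.run() would be the more idiomatic equivalent.)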
def _process_monster_properties(wiki_page_titles):
    loop = asyncio.get_event_loop()
    loop.run_until_complete(_inner_process_monster_properties(wiki_page_titles))


async def _inner_process_monster_properties(wiki_page_titles):
    print(">>> Starting wiki text extraction for extracted page titles...")
    json_data = dict()

    # Limit concurrent connections to the wiki API, and set conservative timeouts
    conn = aiohttp.TCPConnector(limit=20)
    timeout = aiohttp.ClientTimeout(
        total=5 * 60, connect=60, sock_connect=5, sock_read=5
    )
    retry_options = JitterRetry(
        attempts=5,
        exceptions={aiohttp.client_exceptions.ServerTimeoutError},
        start_timeout=1.5,
    )
    retry_client = RetryClient(
        retry_options=retry_options,
        connector=conn,
        timeout=timeout,
        raise_for_status=False,
    )
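    # NOTE (assumption about aiohttp_retry's JitterRetry behaviour): only
    # ServerTimeoutError triggers a retry, up to 5 attempts with exponential backoff
    # plus random jitter starting from ~1.5 s; other failures surface to the caller,
    # and non-2xx responses are not raised because raise_for_status=False.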

    async with retry_client as session:
        tasks = []
        for page_title, _ in wiki_page_titles.page_titles.items():
            # If script fails:
            # 1) Set load_files (above) to True
            # 2) Uncomment code below, and set item ID to last failed item
            # 3) Use this script: python monster_properties_fetch -c Monsters
            # if int(page_titles_count) < 5400:
            #     page_titles_count += 1
            #     continue

            # Check if page title is already present in JSON output file, also check revision date
            if page_title in json_data:
                # If the last revision was before last extract, skip
                continue

            # Create object to extract page wiki text
            wiki_page_text = WikiPageText(OSRS_WIKI_API_URL,
                                          page_title)

            # If the page title has not been extracted, extract wiki text and save to JSON file
            tasks.append(asyncio.ensure_future(wiki_page_text.extract_page_wiki_text(session)))

        d: List[WikiPageText] = await tqdm.gather(*tasks)

    for item in d:
        item.export_wiki_text_to_json(TEXT_FP)

    # tasks = []
    # for item in d:
    #     tasks.append(asyncio.ensure_future(item.async_export_wiki_text_to_json(TEXT_FP)))
    # await asyncio.gather(*tasks)


def process():
    print(">>> Starting wiki page text processing...")

    # Call WikitextIDParser to map:
    # 1. ID to infobox template version
    # 2. ID to wikitext entry
    template_names = ["infobox monster"]
    wiki_data_ids = WikitextIDParser(TEXT_FP, template_names)
    wiki_data_ids.process_osrswiki_data_dump()

    WikiEntry = collections.namedtuple('WikiEntry', 'wiki_page_name version_number wikitext')

    export = dict()
    for item_id, wikitext in wiki_data_ids.item_id_to_wikitext.items():
        entry = WikiEntry(wiki_page_name=wiki_data_ids.item_id_to_wiki_name[item_id],
                          version_number=wiki_data_ids.item_id_to_version_number[item_id],
                          wikitext=wikitext)
        export[item_id] = entry

    out_fi = Path(config.DATA_MONSTERS_PATH / "monsters-wiki-page-text-processed.json")
    with open(out_fi, 'w') as f:
        json.dump(export, f, indent=4)
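    # NOTE: WikiEntry is a namedtuple (a tuple subclass), so json.dump serialises each
    # export entry as a JSON array in the order [wiki_page_name, version_number, wikitext],
    # not as an object keyed by field name.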
if __name__ == "__main__":
fetch()
process()
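
# Example invocation (assuming the repository root is on PYTHONPATH so that the
# `config` and `scripts` packages resolve):
#   python -m scripts.monsters.monsters_properties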