"""
|
|
Author: PH01L
|
|
Email: phoil@osrsbox.com
|
|
Website: https://www.osrsbox.com
|
|
|
|
Description:
|
|
Script to fetch OSRS Wiki pages for Category:Monsters.
|
|
|
|
Copyright (c) 2021, PH01L
|
|
|
|
###############################################################################
|
|
This program is free software: you can redistribute it and/or modify
|
|
it under the terms of the GNU General Public License as published by
|
|
the Free Software Foundation, either version 3 of the License, or
|
|
(at your option) any later version.
|
|
This program is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
GNU General Public License for more details.
|
|
You should have received a copy of the GNU General Public License
|
|
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
###############################################################################
|
|
"""
import asyncio
import collections
import itertools
import json
import os
import sys
from datetime import datetime, timedelta
from pathlib import Path
from typing import List

import aiohttp
from aiohttp_retry import RetryClient, JitterRetry
from tqdm.asyncio import tqdm

import config
from scripts.wiki.wiki_page_titles import WikiPageTitles
from scripts.wiki.wiki_page_text import WikiPageText
from scripts.wiki.wikitext_parser import WikitextIDParser

OSRS_WIKI_API_URL = "https://oldschool.runescape.wiki/api.php"
TITLES_FP = Path(config.DATA_MONSTERS_PATH / "monsters-wiki-page-titles.json")
TEXT_FP = Path(config.DATA_MONSTERS_PATH / "monsters-wiki-page-text.json")


def fetch():
    """Get all the wiki category page titles and page text."""
    # Try to determine the date of the last extraction from the git history
    # of the page titles file
    if TITLES_FP.exists():
        stream = os.popen(f"git log -1 --format='%ad' {TITLES_FP}")
        last_extraction_date = stream.read().strip()
        last_extraction_date = last_extraction_date.replace(" -0500", "")
        try:
            last_extraction_date = datetime.strptime(last_extraction_date, "%a %b %d %H:%M:%S %Y")
            # Step back three days as a safety margin against missed edits
            last_extraction_date = last_extraction_date - timedelta(days=3)
        except ValueError:
            # Fall back to the OSRS release date, i.e., re-extract everything
            last_extraction_date = datetime.strptime("2013-02-22", "%Y-%m-%d")
    else:
        last_extraction_date = datetime.strptime("2013-02-22", "%Y-%m-%d")
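
    # For reference: `git log --format='%ad'` prints dates like
    # "Mon Feb 22 12:34:56 2021 -0500"; stripping the offset lets the
    # "%a %b %d %H:%M:%S %Y" format parse it. The hard-coded " -0500" only
    # handles that one timezone offset; any other offset fails to parse and
    # falls through to the fallback date. As written, last_extraction_date is
    # informational only: nothing below reads it.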

    print(">>> Starting wiki page titles extraction...")
    # Create object to handle page titles extraction
    wiki_page_titles = WikiPageTitles(OSRS_WIKI_API_URL,
                                      ["Monsters"])

    # Toggle: load page titles from file, or run a fresh page title extraction
    load_files = False

    # Load previously extracted page titles from JSON, or extract from the OSRS Wiki API
    if load_files:
        loaded_page_titles = wiki_page_titles.load_page_titles(TITLES_FP)
        if not loaded_page_titles:
            sys.exit(">>> ERROR: Specified page titles to load, but no file found. Exiting.")
    else:
        # Extract page titles using the supplied categories
        wiki_page_titles.extract_page_titles()

        # Extract the last revision date for each page
        # Loop 50 page titles at a time, the maximum for a revisions request by page title
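        # The expression below uses the standard "grouper" idiom: all 50 list
        # entries share a single iterator, so each yielded tuple drains the
        # next 50 titles, padded with None at the end. A smaller illustration:
        #   list(itertools.zip_longest(*[iter("abcde")] * 2))
        #   -> [('a', 'b'), ('c', 'd'), ('e', None)]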
        for page_title_list in itertools.zip_longest(*[iter(wiki_page_titles.page_titles)] * 50):
            # Remove None padding from the list of page titles
            page_title_list = filter(None, page_title_list)
            # Join the page titles using the pipe (|) separator
            page_titles_string = "|".join(page_title_list)
            # Extract the page revision date
            wiki_page_titles.extract_last_revision_timestamp(page_titles_string)

        # Save all page titles and revision timestamps to a JSON file
        wiki_page_titles.export_page_titles_in_json(TITLES_FP)

    # Determine the page titles count
    page_titles_total = len(wiki_page_titles)
    print(f">>> Number of extracted wiki pages: {page_titles_total}")

    # Fetch the wiki text for every extracted page title
    _process_monster_properties(wiki_page_titles)


def _process_monster_properties(wiki_page_titles):
    """Run the asynchronous wiki text extraction to completion."""
    asyncio.run(_inner_process_monster_properties(wiki_page_titles))


async def _inner_process_monster_properties(wiki_page_titles):
    print(">>> Starting wiki text extraction for extracted page titles...")
    # Map of page titles already written to the output file; left empty here,
    # so every page title is fetched. Pre-populate it from TEXT_FP to resume
    # a partially completed run.
    json_data = dict()

    # Cap concurrent connections to avoid hammering the wiki API
    conn = aiohttp.TCPConnector(limit=20)
    # Generous overall timeout with short per-socket limits
    timeout = aiohttp.ClientTimeout(
        total=5 * 60, connect=60, sock_connect=5, sock_read=5
    )
    retry_options = JitterRetry(
        attempts=5,
        exceptions={aiohttp.client_exceptions.ServerTimeoutError},
        start_timeout=1.5,
    )
    retry_client = RetryClient(
        retry_options=retry_options,
        connector=conn,
        timeout=timeout,
        raise_for_status=False,
    )
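
    # As configured above, JitterRetry retries a failed request up to
    # `attempts` times, growing the delay from `start_timeout` (exponential
    # backoff plus random jitter), and only for the listed exception types
    # (here, server timeouts). raise_for_status=False leaves HTTP error
    # responses for the caller to handle rather than raising on them.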
    async with retry_client as session:
        tasks = []
        for page_title, _ in wiki_page_titles.page_titles.items():
            # If the script fails:
            # 1) Set load_files (above) to True
            # 2) Uncomment the code below, and set the count to the last failed item
            # 3) Re-run this script: python monster_properties_fetch -c Monsters
            # if int(page_titles_count) < 5400:
            #     page_titles_count += 1
            #     continue

            # Skip the page title if it is already present in the JSON output
            # file (see json_data above); the revision date could also be
            # compared here to skip pages unchanged since the last extraction
            if page_title in json_data:
                continue

            # Create an object to extract the page wiki text
            wiki_page_text = WikiPageText(OSRS_WIKI_API_URL,
                                          page_title)

            # Queue the page for asynchronous wiki text extraction
            tasks.append(asyncio.ensure_future(wiki_page_text.extract_page_wiki_text(session)))

        results: List[WikiPageText] = await tqdm.gather(*tasks)
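
    # tqdm.gather (just above) is a drop-in wrapper around asyncio.gather that
    # draws a progress bar while the tasks run; results arrive in task
    # submission order, each a WikiPageText holding its fetched wiki text.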
    for item in results:
        item.export_wiki_text_to_json(TEXT_FP)

    # Alternative: export asynchronously as well
    # tasks = []
    # for item in results:
    #     tasks.append(asyncio.ensure_future(item.async_export_wiki_text_to_json(TEXT_FP)))
    # await asyncio.gather(*tasks)


def process():
    print(">>> Starting wiki page text processing...")
    # Use WikitextIDParser to map:
    # 1. ID to infobox template version
    # 2. ID to wikitext entry
    template_names = ["infobox monster"]
    wiki_data_ids = WikitextIDParser(TEXT_FP, template_names)
    wiki_data_ids.process_osrswiki_data_dump()

    WikiEntry = collections.namedtuple('WikiEntry', 'wiki_page_name version_number wikitext')

    # Build a dictionary of monster ID -> WikiEntry
    export = dict()
    for item_id, wikitext in wiki_data_ids.item_id_to_wikitext.items():
        entry = WikiEntry(wiki_page_name=wiki_data_ids.item_id_to_wiki_name[item_id],
                          version_number=wiki_data_ids.item_id_to_version_number[item_id],
                          wikitext=wikitext)
        export[item_id] = entry
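
    # json.dump serializes namedtuples as JSON arrays (they are tuple
    # subclasses), so each entry is written as
    # [wiki_page_name, version_number, wikitext]; use entry._asdict() in the
    # loop above if keyed objects are preferred in the output file.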
    out_fi = Path(config.DATA_MONSTERS_PATH / "monsters-wiki-page-text-processed.json")
    with open(out_fi, 'w') as f:
        json.dump(export, f, indent=4)


if __name__ == "__main__":
    fetch()
    process()