osrsbox-db/scripts/wiki/wiki_page_titles.py

225 lines
9.2 KiB
Python

"""
Author: PH01L
Email: phoil@osrsbox.com
Website: https://www.osrsbox.com
Copyright (c) 2019, PH01L
###############################################################################
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
###############################################################################
"""
import json
import logging
import os
from pathlib import Path
from typing import Dict
from typing import Generator
import requests
import config
LOG = logging.getLogger(__name__)
class WikiPageTitles:
"""This class handles extraction of wiki page titles by category using an OSRS Wiki API query.
:param base_url: The OSRS Wiki URL used for API queries.
:param categories: A list of OSRS Wiki categories.
"""
def __init__(self, base_url: str, categories: list):
self.base_url = base_url
self.categories = categories
self.page_titles: Dict[str, str] = dict()
def __iter__(self) -> Generator[str, None, None]:
"""Iterate (loop) over the extracted or loaded OSRS Wiki page titles.
:return: An extracted page title from the OSRS Wiki for a specific category.
"""
for page_title in self.page_titles:
yield page_title
def __len__(self) -> int:
"""Return the length of the extract or loaded OSRS Wiki page titles.
:return: The number of extracted or loaded page titles.
"""
return len(self.page_titles)
def __getitem__(self, key: str) -> str:
"""Return the revision date of the provided page title.
:param key: The page title to extract the revision date from.
:return last_revision_date: The date the page was last modified in a string ISO8601 format.
:rtype str:
"""
return self.page_titles[key]
def load_page_titles(self, in_file_name: str) -> bool:
"""Load a JSON file of OSRS Wiki page titles that have previously been extracted.
If you already have a recently dumped JSON file of OSRS Wiki page titles you
can load them using this function. This saves performing additional wiki API
queries when you have an up-to-date list of page titles.
:param in_file_name: The name of the file to load page titles from.
:return: A boolean to indicate if a file with page titles was loaded correctly.
"""
if not Path.exists(in_file_name):
return False
with open(in_file_name) as input_json_file:
self.page_titles = json.load(input_json_file)
return True
def extract_page_titles(self):
"""Query a list of categories in the OSRS Wiki and return a list of page titles.
This function is used to loop the list of categories that you want to extract
from the OSRS Wiki using the MediaWiki API. You can all it using one category,
for example: `Items`. Or you can use a list of category strings, for example:
`Items, Pets, Furniture`.
"""
for category in self.categories:
self.extract_page_titles_from_category(category)
def extract_page_titles_from_category(self, category: str):
"""Query a specific category in the OSRS Wiki and populate a list of page tiles.
A function to extract all page titles from a provided OSRS Wiki category. The
extracted page title is useful to perform additional API queries, such as
extracting wiki text or revision timestamps.
:param category: A string representing the OSRS Wiki category to extract.
"""
# Start construct MediaWiki request
request = {'list': 'categorymembers'}
for result in self._extract_page_titles_from_category_callback(request, category):
# Process JSON result data
for entry in result['categorymembers']:
page_title = entry["title"]
if page_title.startswith("File:"):
continue
if page_title.startswith("Category:"):
continue
# Log the page title, and append to list
self.page_titles[page_title] = None
def _extract_page_titles_from_category_callback(self, request: Dict, category: str):
"""Query callback function for OSRS Wiki category query.
A callback function for using MediaWiki generators. Since the category query is a
list function, you can use a generator to continue queries when the returned data
is longer than the maximum returned query.
:param request: A dictionary to be populated with the OSRS Wiki API request.
:param category: A string representing the OSRS Wiki category to extract.
"""
request['cmtitle'] = f'Category:{category}'
request['action'] = 'query'
request['format'] = 'json'
request['cmlimit'] = '500'
last_continue = {}
while True:
# Clone original request
req = request.copy()
# Insert the 'continue' section to the request
req.update(last_continue)
# Perform HTTP GET request
try:
result = requests.get(self.base_url,
headers=config.custom_agent,
params=req).json()
except requests.exceptions.RequestException as e:
raise SystemExit(">>> ERROR: Get request error. Exiting.") from e
# Handle HTTP response
if 'query' in result:
# If "query" entry is in JSON result, extract the query response
yield result['query']
if 'continue' not in result:
# If "continue" entry is not JSON result, there are no more page titles in category
break
if 'errors' in result:
print(result['errors'])
break
if 'warnings' in result:
print(result['warnings'])
break
# Update the last fetched page title to continue query
last_continue = result['continue']
def extract_last_revision_timestamp(self, page_titles_string: str) -> Dict:
"""Extract the last revision timestamp for page titles from OSRS Wiki.
The MediaWiki API used by the OSRS Wiki has the functionality to return the
last revision date of a specific page. This function extracts the revision
date for a list of extracted page titles. This value can be used to determine
if changes have recently been made to the page, and if the page should be
processed again. The input page_titles_string needs to be a list of page
titles separated by the pipe (|) character. The maximum number of page titles
is 50 per API query.
:param page_titles_string: A string of pipe separated wiki page titles.
:return pages_revision_data:
"""
# Construct query for fetching page revisions
request = {
'action': 'query',
'prop': 'revisions',
'titles': page_titles_string,
'format': 'json',
'rvprop': 'timestamp'
}
page_data = requests.get(self.base_url,
headers=config.custom_agent,
params=request).json()
# Loop returned page revision data
pages_revision_data = page_data["query"]["pages"]
for page_id in pages_revision_data:
# Extract page title from the response
page_title = pages_revision_data[page_id]["title"]
# Extract last revision timestamp (ISO 8601 format)
page_revision_date = pages_revision_data[page_id]["revisions"][0]["timestamp"]
# Add revision date to page_titles dict
self.page_titles[page_title] = page_revision_date
return pages_revision_data
def export_page_titles_in_json(self, out_file_name: str):
"""Export all extracted page titles and revision timestamp to a JSON file.
:param out_file_name: The file name used for exporting the wiki page titles to JSON.
"""
os.makedirs(os.path.dirname(out_file_name), exist_ok=True)
with open(out_file_name, mode='w') as out_file:
out_file.write(json.dumps(self.page_titles, indent=4))
def export_page_titles_in_text(self, out_file_name: str):
"""Export all extracted page titles from a category to a text file.
:param out_file_name: The file name used for exporting the wiki page titles to a text file.
"""
with open(out_file_name, mode='w', newline='\n') as out_file:
for page_title in self.page_titles:
out_file.write(page_title + '\n')