"""
|
|
Author: PH01L
|
|
Email: phoil@osrsbox.com
|
|
Website: https://www.osrsbox.com
|
|
|
|
Copyright (c) 2019, PH01L
|
|
|
|
###############################################################################
|
|
This program is free software: you can redistribute it and/or modify
|
|
it under the terms of the GNU General Public License as published by
|
|
the Free Software Foundation, either version 3 of the License, or
|
|
(at your option) any later version.
|
|
This program is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
GNU General Public License for more details.
|
|
You should have received a copy of the GNU General Public License
|
|
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
###############################################################################
|
|
"""
|
|
import json
import logging
from pathlib import Path
from pprint import pprint
from typing import TypeVar

import aiofiles

import config

LOG = logging.getLogger(__name__)

SelfWikiPageText = TypeVar("SelfWikiPageText", bound="WikiPageText")


class WikiPageText:
    """This class handles extraction of wiki text using an OSRS Wiki API query.

    :param base_url: The OSRS Wiki URL used for API queries.
    :param page_title: The OSRS Wiki page title used for the API query.
    """

    def __init__(self, base_url: str, page_title: str):
        self.base_url = base_url
        self.page_title = page_title
        self.wiki_text = None

    async def extract_page_wiki_text(self, session) -> SelfWikiPageText:
        """Extract wiki text from the OSRS Wiki for the configured page title.

        This function uses the class attributes as input to query the OSRS Wiki
        API and extract the wiki text for a specific page. The page to query is
        determined by the page title.

        :param session: An open HTTP client session (e.g. aiohttp.ClientSession)
            used to perform the API request.
        """
        request = {
            "action": "parse",
            "prop": "wikitext",
            "format": "json",
            "page": self.page_title
        }

        pprint(f"Processing: {self.page_title}")
        # Perform the HTTP GET request against the OSRS Wiki API
        async with session.get(self.base_url, headers=config.custom_agent, params=request) as resp:
            try:
                data = await resp.json()
            except json.decoder.JSONDecodeError:
                pprint(f"Didn't get anything useful: {await resp.text()}")
                # Fall back to an empty dict so the KeyError handling below applies
                data = dict()

        try:
            # Try to extract the wiki text from the HTTP response
            wiki_text = data["parse"]["wikitext"]["*"]
        except KeyError:
            # Set to None if wiki text extraction failed
            wiki_text = None

        self.wiki_text = wiki_text
        return self

    async def async_export_wiki_text_to_json(self, out_file_name: str):
        """Export all extracted wiki text to a JSON file.

        Repeatedly querying the OSRS Wiki is a bad approach. This function writes
        any extracted wiki text to a file to save re-querying the API. If the file
        already exists, the entry for this page is overwritten, where the key is
        the page title and the value is the wiki text.

        :param out_file_name: The file name to save wiki text to.
        """
        # Create dictionary for export
        json_data = {self.page_title: str(self.wiki_text)}

        out_file_name = Path(out_file_name)

        pprint(f"{json_data}")
        # Write dictionary to JSON file
        if not out_file_name.exists():
            # No existing file: write a new JSON file with this single entry
            async with aiofiles.open(out_file_name, mode='w') as out_file:
                await out_file.write(json.dumps(json_data, indent=4))
        else:
            # Existing file: load it, update this page's entry, then rewrite it
            async with aiofiles.open(out_file_name) as feeds_json:
                contents = await feeds_json.read()
                try:
                    feeds = json.loads(contents)
                except json.decoder.JSONDecodeError:
                    pprint(f"{self.page_title} - {self.wiki_text}")
                    # Start from an empty dict if the existing file is not valid JSON
                    feeds = dict()

            feeds[self.page_title] = str(self.wiki_text)
            async with aiofiles.open(out_file_name, mode='w') as out_file:
                await out_file.write(json.dumps(feeds, indent=4))

    def export_wiki_text_to_json(self, out_file_name: str):
        """Export all extracted wiki text to a JSON file (synchronous version).

        Repeatedly querying the OSRS Wiki is a bad approach. This function writes
        any extracted wiki text to a file to save re-querying the API. If the file
        already exists, the entry for this page is overwritten, where the key is
        the page title and the value is the wiki text.

        :param out_file_name: The file name to save wiki text to.
        """
        # Create dictionary for export
        json_data = {self.page_title: str(self.wiki_text)}

        out_file_name = Path(out_file_name)

        pprint(f"Writing {self.page_title}")
        # Write dictionary to JSON file
        if not out_file_name.exists():
            with open(out_file_name, mode='w') as out_file:
                out_file.write(json.dumps(json_data, indent=4))
        else:
            with open(out_file_name) as feeds_json:
                feeds = json.load(feeds_json)
            feeds[self.page_title] = str(self.wiki_text)
            with open(out_file_name, mode='w') as out_file:
                out_file.write(json.dumps(feeds, indent=4))
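

# Minimal usage sketch (illustrative, not part of the original module). It
# assumes the OSRS Wiki API endpoint and page title shown below, that the
# project-level config.custom_agent holds a valid headers dict, and that
# aiohttp is installed to provide the client session passed into
# extract_page_wiki_text().
if __name__ == "__main__":
    import asyncio

    import aiohttp

    async def main():
        # Hypothetical example values; any OSRS Wiki page title works here.
        base_url = "https://oldschool.runescape.wiki/api.php"
        page = WikiPageText(base_url, "Abyssal whip")
        async with aiohttp.ClientSession() as session:
            await page.extract_page_wiki_text(session)
        await page.async_export_wiki_text_to_json("extracted_wiki_text.json")

    asyncio.run(main())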