# User:Zzuuzz/scripts/bad image check.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Copyright (c) 2023 User:zzuuzz at English Wikipedia
"""
A script to check a MediaWiki bad image list for problems.

The main checks are:
    Formatting of list items
    Missing / redirected pages and files
    Uses without a listed exception

Notes:
    This script prints output to the terminal and doesn't make any changes.
    Command line arguments are not supported.
    Should work on most MediaWiki wikis, including Wikipedias.

Requires:
    Python 3.6+
    The pywikibot module must be installed

Usage:
    python3 <script name>, or maybe just: python <script name>
    pwb <script name>

"""

import re
import sys
from typing import Dict, List, Set

import pywikibot
import pywikibot.data.api


# Configurable options:

# Site to check; this value is passed to pywikibot.Site().
SITE = "wikipedia:en"  # Format like "wikipedia:en", "meta", or "wikidata"
# Title of the wiki page that holds the bad image list.
BIL_PAGE = "MediaWiki:Bad image list"  # Wiki page title
# Get an old revision by oldid; use False (or 0, None, "", etc) for latest.
# Example: OLD_ID = 1065475922
OLD_ID = 0

# Pretty output flags: status labels wrapped in ANSI colour escape codes,
# used as prefixes of the printed report lines.
FAIL = "\033[91mFAIL\033[m:"
INFO = "\033[94mINFO\033[m:"
SUCCESS = "\033[32mOK\033[m:"


# Here be dragons...


class BadImageListItem:
    """One parsed entry of the bad image list.

    The constructor stores its arguments unchanged:
        linenum: line number of the entry in the list page
        title: the first link target found on that line (``image``)
        exceptions: the remaining link targets found on that line
    """

    def __init__(self, linenum, image, exceptions) -> None:
        self.linenum, self.title = linenum, image
        self.exceptions = exceptions

    @property
    def link(self) -> pywikibot.Link:
        """Return a new ``pywikibot.Link`` built from the raw title.

        The link is created against the module-level ``site``.
        """
        return pywikibot.Link(self.title, site)

    @property
    def norm_title(self) -> str:
        """Return the canonical title reported by the entry's link."""
        return self.link.canonical_title()


class BadImageFileInfo:
    """Convenience accessors over one page record from the API query.

    ``data`` is the raw page dictionary; every property below only reads
    from it. The keys consulted are "title", "missing", "imagerepository",
    "imageinfo" (a list of dicts) and "fileusage" (a list of dicts).
    """

    def __init__(self, data: dict) -> None:
        self.data = data
        # True when the record carries a "missing" key, whatever its value.
        self.missing = "missing" in data

    @property
    def file_missing(self) -> bool:
        """Return True if any imageinfo item carries a "filemissing" key."""
        return any(
            "filemissing" in item for item in self.data.get("imageinfo", ())
        )

    @property
    def is_local_image(self) -> bool:
        """Return True if the record's "imagerepository" is "local"."""
        return self.data.get("imagerepository") == "local"

    @property
    def is_redirect(self) -> bool:
        """Return True if the title differs from a known canonical title.

        A record without any canonical title is not treated as a redirect.
        Comparing against the empty fallback would flag every such record
        as a redirect to "".
        """
        target = self.target_canonical_title
        return bool(target) and self.data["title"] != target

    @property
    def target_canonical_title(self) -> str:
        """Return the first "canonicaltitle" in imageinfo, or "" if none."""
        for revision in self.data.get("imageinfo", ()):
            if "canonicaltitle" in revision:
                return revision["canonicaltitle"]
        return ""

    @property
    def title(self) -> str:
        """Return the record's "title" value."""
        return self.data["title"]

    @property
    def usage(self) -> Set[str]:
        """Return the set of titles listed under "fileusage".

        Items without a "title" key are skipped.
        """
        return {
            item["title"]
            for item in self.data.get("fileusage", ())
            if "title" in item
        }


def load_fileinfo(filenames: List[str]) -> Dict[str, BadImageFileInfo]:
    """Query image info and file usage for the given titles.

    The titles are requested in batches through a pywikibot
    ``PropertyGenerator`` on the module-level ``site``. A rolling
    percentage is printed while the batches are processed.

    Returns a dict mapping each returned page's "title" to a
    ``BadImageFileInfo`` wrapping that page's data.
    """
    batchsize = 50  # API has a normal lower request limit of 50 pages.
    total = len(filenames)
    collected: Dict[str, BadImageFileInfo] = {}
    for start in range(0, total, batchsize):
        # Progress is based on the number of records gathered so far.
        percent = int(len(collected) / total * 100)
        print(f"\033[KGetting info ... {percent}%\r", end="")
        pages = pywikibot.data.api.PropertyGenerator(
            prop="imageinfo|fileusage",
            site=site,
            parameters={
                "fuprop": "title|redirect",
                "iilimit": 1,
                "iiprop": "badfile|canonicaltitle",
                "titles": filenames[start:start + batchsize],
            },
        )
        for page in pages:
            collected[page["title"]] = BadImageFileInfo(page)
    print("\033[K\r", end="")  # clear rolling status
    return collected


# Connect to the configured site and fetch the list page.
site = pywikibot.Site(SITE)
print(f"{INFO} Checking bad image list for {site.sitename}")
bil_page = pywikibot.Page(site, BIL_PAGE)
if not bil_page.exists():
    sys.exit(f"No list found at {bil_page}")
# A truthy OLD_ID selects that old revision; otherwise use the page text.
bil_text = bil_page.getOldVersion(OLD_ID) if OLD_ID else bil_page.text
bil_lines = bil_text.splitlines()
if not bil_lines:
    sys.exit("Empty list")

# Shared state for the checks below.
image_by_line: Dict[int, BadImageListItem] = {}  # entry per line number
image_by_name: Dict[str, List[BadImageListItem]] = {}  # entries per title
line_num: int = 0
fatal_line_errors: List[int] = []
duplicates: Set[str] = set()
fileinfo: Dict[str, BadImageFileInfo] = {}

# Build data dictionary
for line_num, line in enumerate(bil_lines, start=1):
    # Only lines starting with "*" are list items.
    if not line.startswith("*"):
        continue
    links = re.findall(r"\[\[:?([^\]]*)\]\]", line)
    if not links:
        continue
    # The first link is the listed image; any further links are exceptions.
    entry = BadImageListItem(line_num, links[0], links[1:])
    image_by_line[line_num] = entry
    # Group by normalized title so duplicates can be detected later.
    image_by_name.setdefault(entry.norm_title, []).append(entry)

if not image_by_line:
    sys.exit("No entries found")

# Check list problems - piped links, namespace, duplicates
print(f"{INFO} Checking for namespace and link errors")
for line_num, bil in image_by_line.items():
    link = bil.link
    line_is_fatal = False
    if link.anchor:
        print(f"{FAIL} -> Error: Piped link: {bil.title} [{line_num}]")
        line_is_fatal = True
    if link.namespace != site.namespaces.FILE:
        print(f"{FAIL} -> Error: Wrong namespace: {bil.title} [{line_num}]")
        line_is_fatal = True
    # Record each faulty line once, even when it fails both checks, so the
    # removal loop below never tries to delete the same line twice.
    if line_is_fatal:
        fatal_line_errors.append(line_num)
    # Add extra dup detection processing
    if len(image_by_name[bil.norm_title]) > 1:
        duplicates.add(bil.norm_title)

# Snapshot the line numbers of every duplicated name before faulty entries
# are removed, so the duplicate report stays complete and cannot hit a name
# that has been dropped from image_by_name.
duplicate_lines: Dict[str, List[str]] = {
    name: [str(item.linenum) for item in image_by_name[name]]
    for name in duplicates
}

# Remove only the faulty entries. Valid entries that share a normalized
# title with a faulty one stay listed and are still checked further on.
for line_num in fatal_line_errors:
    bad_entry = image_by_line.pop(line_num)
    name = bad_entry.norm_title
    remaining = [
        item for item in image_by_name.get(name, []) if item is not bad_entry
    ]
    if remaining:
        image_by_name[name] = remaining
    else:
        image_by_name.pop(name, None)

# Check duplicate file names
if duplicates:
    print(f"{FAIL} {len(duplicates)} Duplicate file names found:")
    for s in sorted(duplicates):
        ln = duplicate_lines[s]
        print(f"-> {s} [{', '.join(ln)}]")
else:
    print(f"{SUCCESS} No duplicate file names found")

# Normalize file names: report entries whose written title differs from
# the normalized title.
for line, bil in image_by_line.items():
    normalized = bil.norm_title
    if bil.title == normalized:
        continue
    print(f"{INFO} Normalizable: {bil.title} -> {normalized} [{line}]")

# Load file and exception info
print(f"{INFO} Checking file info")
fileinfo = load_fileinfo(list(image_by_name))

# Check for missing files
print(f"{INFO} Checking for missing files")
# Records whose file is missing are split by whether the page record is
# flagged missing as well (red link) or not.
redlinks: List[BadImageFileInfo] = [
    info for info in fileinfo.values() if info.file_missing and info.missing
]
filemissing: List[BadImageFileInfo] = [
    info
    for info in fileinfo.values()
    if info.file_missing and not info.missing
]

if not redlinks:
    print(f"{SUCCESS} No red links found")
else:
    print(f"{FAIL} {len(redlinks)} Red links found:")
    for info in redlinks:
        line_numbers = ", ".join(
            str(bil.linenum) for bil in image_by_name[info.title]
        )
        print(f"-> {info.title} [{line_numbers}]")

if not filemissing:
    print(f"{SUCCESS} No other missing files found")
else:
    print(f"{FAIL} {len(filemissing)} Missing files (deleted on commons):")
    for info in filemissing:
        line_numbers = ", ".join(
            str(bil.linenum) for bil in image_by_name[info.title]
        )
        print(f"-> {info.title} [{line_numbers}]")

# Check for local and unlisted commons redirects
print(f"{INFO} Checking for redirects")
local_redirects: List[BadImageFileInfo] = []
unlisted_commons_redirs: List[BadImageFileInfo] = []

for info in fileinfo.values():
    if not info.is_redirect:
        continue
    if info.is_local_image:
        local_redirects.append(info)
    elif info.target_canonical_title not in fileinfo:
        # Non-local redirect whose target has no record of its own.
        unlisted_commons_redirs.append(info)

if not local_redirects:
    print(f"{SUCCESS} No local redirects found")
else:
    print(f"{FAIL} {len(local_redirects)} Local redirects found:")
    for info in local_redirects:
        line_numbers = ", ".join(
            str(bil.linenum) for bil in image_by_name[info.title]
        )
        print(
            f"-> {info.title} <- redirects to -> "
            f"{info.target_canonical_title} [{line_numbers}]"
        )

# Redirect targets that could be added to the list.
additions: Set[str] = {
    info.target_canonical_title for info in unlisted_commons_redirs
}
if not unlisted_commons_redirs:
    print(f"{SUCCESS} No unlisted commons redirects found")
else:
    print(f"{FAIL} {len(unlisted_commons_redirs)} Unlisted commons redirects:")
    for info in unlisted_commons_redirs:
        line_numbers = ", ".join(
            str(bil.linenum) for bil in image_by_name[info.title]
        )
        print(
            f"-> {info.title} <- redirects to -> "
            f"{info.target_canonical_title} [{line_numbers}]"
        )

if additions:
    print(f"{INFO} {len(additions)} Possible additions for commons redirects:")
    newinfo = load_fileinfo(list(additions))
    sorted_additions = []
    for k, info in newinfo.items():
        # Usage of the first already-loaded record that redirects to this
        # target; an empty set when no record does.
        current_uses: Set[str] = next(
            (
                listed.usage
                for listed in fileinfo.values()
                if listed.target_canonical_title == info.title
            ),
            set(),
        )
        msg = f"* [[:{k}]]"
        first_sort_by_name = sorted(info.usage | current_uses)
        if first_sort_by_name:
            # Stable sort: ordered by namespace id, then by name within it.
            sorted_exceptions = sorted(
                first_sort_by_name,
                key=lambda x: pywikibot.Page(site, title=x).namespace().id,
            )
            msg += f" except on [[{']], [['.join(sorted_exceptions)}]]"
        sorted_additions.append(msg)
    print("\n".join(sorted(sorted_additions)))

# Usage / Exceptions
print(f"{INFO} Checking usage and exceptions")


def _normalize_exception(title: str) -> str:
    """Return the canonical title for an exception link target.

    Exception links are taken from the list page as written, while usage
    titles come from the API. Normalizing the link the same way as
    ``BadImageListItem.norm_title`` lets e.g. "foo_bar" match a use on
    "Foo bar". A target that pywikibot cannot parse is returned unchanged.
    """
    try:
        return pywikibot.Link(title, site).canonical_title()
    except pywikibot.exceptions.Error:
        return title


# Per file name: pages using the file and pages listed as exceptions.
exc: Dict[str, Dict[str, Set[str]]] = dict()
used_unexcepted: List[str] = []
for k, bil_list in image_by_name.items():
    listed: Set[str] = set()
    for bil in bil_list:
        # Keep the raw targets as well, so anything that matched before
        # still matches; the normalized forms only add matches.
        listed.update(bil.exceptions)
        listed.update(_normalize_exception(t) for t in bil.exceptions)
    exc[k] = {"usage": set(), "exceptions": listed}
for k, info in fileinfo.items():
    if k not in exc:
        exc[k] = {"usage": set(), "exceptions": set()}
    exc[k]["usage"] = info.usage

for k, v in exc.items():
    if v["usage"] - v["exceptions"]:
        used_unexcepted.append(k)
if used_unexcepted:
    print(f"{FAIL} Usage without exception found:")
    for s in used_unexcepted:
        # A key that only came from the API result has no list entry;
        # report it without line numbers instead of raising KeyError.
        ln = [str(bil.linenum) for bil in image_by_name.get(s, [])]
        msg = f"-> {s} <- used on -> "
        msg += f"{exc[s]['usage'] - exc[s]['exceptions']}"
        msg += f" [{', '.join(ln)}]"
        print(msg)
else:
    print(f"{SUCCESS} No usage without exception found")

#