# User:Broc/update table FEB24.py
#
# From Wikipedia, the free encyclopedia
import requests
import re
import time
from bs4 import BeautifulSoup
from datetime import datetime

import pywikibot

def download_from_wikipedia(page_title):
    """Return the current wikitext of ``page_title`` on English Wikipedia.

    Args:
        page_title: Full title of the page to read, including any namespace.

    Returns:
        The page's ``text`` attribute as provided by pywikibot.
    """
    # 'en' selects the language edition; swap it to target another Wikipedia.
    wiki = pywikibot.Site('en', 'wikipedia')
    return pywikibot.Page(wiki, page_title).text

def upload_to_wikipedia(page_title, modified_text):
    """Overwrite ``page_title`` on English Wikipedia with ``modified_text``.

    Args:
        page_title: Full title of the page to edit, including any namespace.
        modified_text: The complete new wikitext for the page.
    """
    # 'en' selects the language edition; swap it to target another Wikipedia.
    wiki = pywikibot.Site('en', 'wikipedia')
    target_page = pywikibot.Page(wiki, page_title)

    # Assign the replacement text, then save with a fixed edit summary.
    target_page.text = modified_text
    target_page.save("Updated leaderboard using pywikibot")


def _pagename_from_summary_item(li_element):
    """Return the page name of one result ``<li>`` if it counts towards N1.

    An item counts when it has at least four links, the fourth link's href
    contains a ``/wiki/<PAGENAME>`` path, and the first ``<i>`` tag found after
    the item's last link (presumably the edit summary -- confirm against the
    tool's HTML) contains "feb24" but not "feb24review", case-insensitively.

    Returns:
        The captured PAGENAME string, or ``None`` if the item does not qualify.
    """
    a_elements = li_element.find_all('a')
    if len(a_elements) < 4:
        return None

    # The 4th link is expected to point at the edited page.
    pagename_match = re.search(r'/wiki/([^?&#]+)', a_elements[3].get('href', ''))
    if not pagename_match:
        return None

    i_tag = a_elements[-1].find_next('i')
    if not i_tag:
        return None

    summary = i_tag.get_text().lower()
    if 'feb24' in summary and 'feb24review' not in summary:
        return pagename_match.group(1)
    return None


def get_n1_value(username, session):
    """Count the distinct pages a user edited with a FEB24 edit summary.

    Queries the sigma summary tool for ``username`` and scans the second
    ``<ul>`` of the returned HTML for qualifying entries.

    Args:
        username: User name to look up. It is sent as a query parameter so that
            characters such as ``&``, ``#``, ``+`` or spaces are percent-encoded
            instead of corrupting the query string.
        session: A ``requests.Session``-like object providing ``get``.

    Returns:
        ``(n1_value, pagenames)`` where ``pagenames`` lists every qualifying
        page name (duplicates included) and ``n1_value`` is the number of
        distinct names in it. ``(None, None)`` is returned when the request
        fails, the status code is not 200, or the page has fewer than two
        ``<ul>`` elements.
    """
    query = {
        'name': username,
        'search': 'FEB24',
        'max': '500',
        'server': 'enwiki',
        'ns': 'Main',
        'enddate': '20240201',
        'startdate': '',
    }

    try:
        # A timeout keeps one stalled request from hanging the whole run.
        response = session.get(
            'https://sigma.toolforge.org/summary.py',
            params=query,
            timeout=30,
        )
    except requests.RequestException as error:
        # Treat transport errors like any other failed lookup.
        print(f"Request for {username} failed: {error}")
        return None, None

    if response.status_code != 200:
        return None, None

    soup = BeautifulSoup(response.content, 'html.parser')
    ul_elements = soup.find_all('ul')
    if len(ul_elements) < 2:
        return None, None

    # The results are taken from the second <ul> on the page.
    pagenames = []
    for li_element in ul_elements[1].find_all('li'):
        pagename = _pagename_from_summary_item(li_element)
        if pagename is not None:
            pagenames.append(pagename)

    return len(set(pagenames)), pagenames

def _update_leaderboard_rows(lines, review_text, session):
    """Rewrite the N1 and N2 cells of every user row in ``lines`` in place.

    A user row is any line containing ``![[User:``. The N1 cell is assumed to
    sit two lines below it and the N2 cell three lines below (this mirrors the
    table layout the script was written for -- verify if the table changes).
    """
    processed_usernames = set()

    for i, line in enumerate(lines):
        if '![[User:' not in line:
            continue

        # Stop at ']' or '|' so a piped link [[User:X|label]] yields just "X".
        user_match = re.search(r'\[\[User:([^]|]+)', line)
        if user_match is None:
            # e.g. an empty link "[[User:]]"; nothing to look up.
            continue

        username = user_match.group(1)
        if username in processed_usernames:
            continue

        n1_value, _ = get_n1_value(username, session)
        if n1_value is None:
            # Lookup failed; leave the row untouched so it can be retried.
            continue

        n1_line_index = i + 2
        n2_line_index = i + 3
        if n2_line_index >= len(lines):
            # Truncated row: both cells must exist before either is rewritten.
            continue

        n2_value = count_reviews_in_section(review_text, username)

        lines[n1_line_index] = f"| {n1_value}"
        # Each review item contributes 0.5; trailing ".0" is trimmed for display.
        lines[n2_line_index] = f"| {n2_value/2}".rstrip('0').rstrip('.')

        # Debug prints
        print(f"User: {username}")
        print(f"New N1 line: {lines[n1_line_index]}")
        print(f"New N2 line: {lines[n2_line_index]}")

        processed_usernames.add(username)

        # Small pause between lookups to avoid hammering the remote tool.
        time.sleep(0.1)


def _refresh_last_updated(text):
    """Replace the "Last updated <Month> <day>, <HH:MM> UTC." stamp with now."""
    from datetime import timezone

    date_pattern = re.compile(r'Last updated ([A-Za-z]+ \d{1,2}, \d{1,2}:\d{2} UTC\.)')
    match = date_pattern.search(text)
    print(match)

    if not match:
        return text

    old_date_string = match.group(0)
    # Timezone-aware replacement for the deprecated datetime.utcnow().
    current_timestamp = datetime.now(timezone.utc).strftime('Last updated %B %d, %H:%M UTC.')
    print(current_timestamp)
    return text.replace(old_date_string, current_timestamp)


def parse_table(text, review_text):
    """Refresh the leaderboard table in ``text`` and write it to disk.

    Locates the first ``{| class="wikitable sortable"`` table, updates the N1
    and N2 cells of every user row, refreshes the "Last updated" stamp, and
    writes the result to ``updated_table.txt`` in the working directory.

    Args:
        text: Wikitext of the leaderboard page.
        review_text: Wikitext of the reviews page, used to count each user's
            review items.

    Returns:
        The updated wikitext, or ``None`` when no table is found (in which case
        nothing is written and no network request is made).
    """
    start_index = text.find('{| class="wikitable sortable"')
    if start_index == -1:
        return None

    end_index = text.find('|}', start_index)
    if end_index == -1:
        # Unterminated table: treat the rest of the text as table content.
        end_index = len(text)

    lines = text[start_index:end_index].split('\n')

    # The context manager closes the session's connections when done.
    with requests.Session() as session:
        _update_leaderboard_rows(lines, review_text, session)

    modified_table = '\n'.join(lines)
    text = text[:start_index] + modified_table + text[end_index:]
    text = _refresh_last_updated(text)

    # The platform default encoding is kept here because the script's entry
    # point reads this file back with the same default; change both together.
    with open('updated_table.txt', 'w') as new_file:
        new_file.write(text)

    return text

def count_reviews_in_section(file_content, section_name):
    """Count the numbered list items inside one level-2 section of wikitext.

    A section starts at a line of the form ``== Title ==`` and runs until the
    next such line. Within the section whose title equals ``section_name``,
    every line starting with ``#`` but not with ``#:`` is counted.

    Trailing whitespace (including a stray ``\\r`` from CRLF line endings) is
    ignored when recognising headings, so ``"== Title == "`` still opens a
    section instead of being silently skipped.

    Args:
        file_content: The wikitext to scan.
        section_name: Exact section title to look for (compared after the
            title's surrounding whitespace is stripped).

    Returns:
        The number of counted items; 0 if the section is absent or empty.
    """
    heading_pattern = re.compile(r'^==\s*([^=]+)\s*==$')

    in_target_section = False
    item_count = 0

    for raw_line in file_content.split('\n'):
        line = raw_line.rstrip()

        section_match = heading_pattern.match(line)
        if section_match:
            # Any level-2 heading switches the "inside target" flag on or off.
            in_target_section = section_match.group(1).strip() == section_name
        elif in_target_section and line.startswith('#') and not line.startswith('#:'):
            item_count += 1

    return item_count


if __name__ == "__main__":
    import os

    # Page holding the leaderboard table that gets rewritten.
    wikipedia_page_title = "User:BaranBOT/FEB24DriveLeaderboard"  # Replace with the title of the Wikipedia page you want to edit
    # Page whose per-user sections are counted for the N2 column.
    reviews_page = "Wikipedia:WikiProject_Unreferenced_articles/Backlog_drives/February 2024/Reviews"
    output_path = 'updated_table.txt'

    content = download_from_wikipedia(wikipedia_page_title)
    reviews_text = download_from_wikipedia(reviews_page)

    # parse_table writes nothing when it finds no table, so drop any file left
    # over from an earlier run; otherwise stale content would be uploaded.
    try:
        os.remove(output_path)
    except FileNotFoundError:
        pass

    parse_table(content, reviews_text)

    if os.path.exists(output_path):
        with open(output_path, 'r') as file:
            updated_content = file.read()
        # Upload after the file is closed; the handle is not needed any more.
        upload_to_wikipedia(wikipedia_page_title, updated_content)
    else:
        print("No leaderboard table found; nothing was uploaded.")