parent c5aadb59fd
commit 04b91277c2
@@ -1 +1,2 @@
my-venv
monogatari_nfo

@@ -0,0 +1,241 @@
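# Third-party dependencies: requests and beautifulsoup4
# (pip install requests beautifulsoup4); everything else below is stdlib.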
import os
import requests
from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET
from datetime import datetime
import time

# List of Monogatari seasons with their AniDB IDs
SEASONS = [
    {"name": "Bakemonogatari", "id": 6327},
    {"name": "Nisemonogatari", "id": 8658},
    {"name": "Nekomonogatari Kuro", "id": 9453},
    {"name": "Monogatari: Second Season", "id": 9183},
    {"name": "Hanamonogatari", "id": 10046},
    {"name": "Tsukimonogatari", "id": 10891},
    {"name": "Owarimonogatari", "id": 11350},
    {"name": "Koyomimonogatari", "id": 11827},
    {"name": "Kizumonogatari", "id": 8357},
    {"name": "Zoku Owarimonogatari", "id": 13691}
]
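# Each "id" is the AniDB anime ID: the season's page lives at
# https://anidb.net/anime/<id> (see base_url in fetch_episodes below).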

HEADERS = {
    "User-Agent": "Mozilla/5.0 mf2389hf890328-fh37h7fd32h7"
}
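# NOTE (assumption): AniDB is known to block scrapers, so the distinctive
# User-Agent above plus the time.sleep(0.5) delay between requests (see
# fetch_episodes) keep the crawl polite; this is inferred from the code,
# not from documented AniDB policy.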

def fetch_episode_details(episode_id):
    """Fetch detailed information for a given episode."""
    episode_url = f"https://anidb.net/episode/{episode_id}"
    print(f"[FETCH] {episode_url}")

    try:
        res = requests.get(episode_url, headers=HEADERS, timeout=30)
        res.raise_for_status()
    except Exception as e:
        print(f"[ERROR] Failed to fetch episode {episode_id}: {e}")
        return None

    soup = BeautifulSoup(res.text, "html.parser")

    # Find the table with episode details inside the div with class 'g_definitionlist'
    details_div = soup.find("div", class_="g_definitionlist")
    table = details_div.find("table") if details_div else None
    if not table:
        print(f"[WARN] Table with episode details not found for episode {episode_id}.")
        return None

    episode_data = {}

    # Extract the episode details from the table
    for row in table.find("tbody").find_all("tr"):
        try:
            if "romaji" in row.get("class", []):  # Episode title (romaji)
                episode_data["title_romaji"] = row.find("td").find("span").text.strip()

            elif "official" in row.get("class", []):  # Official (English) title
                episode_data["title_en"] = row.find("td").find("label").text.strip()

            elif "date" in row.get("class", []):  # Air date
                air_date = row.find("td").find("span").text.strip()
                episode_data["air_date"] = datetime.strptime(air_date, "%d.%m.%Y").strftime("%Y-%m-%d")

            elif "rating" in row.get("class", []):  # Rating
                episode_data["rating"] = row.find("td").find("span", class_="value").text.strip()

        except Exception as e:
            print(f"[SKIP] Error parsing row: {e}")
            continue

    # Keep the AniDB episode ID for this episode
    episode_data["anidbid"] = episode_id

    # Extract the plot from the div with class "summary"
    summary_div = soup.find("div", class_="summary")
    if summary_div:
        plot = ""
        # When the summary contains two or more <br> tags, keep only the
        # content before the first one; otherwise take the whole text
        br_tags = summary_div.find_all("br")
        if len(br_tags) >= 2:
            plot = "".join(str(summary_div.contents[i]) for i in range(summary_div.contents.index(br_tags[0])))
        else:
            plot = summary_div.text.strip()
        episode_data["plot"] = plot.strip()
    else:
        episode_data["plot"] = "No plot available"

    return episode_data
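
# For reference, fetch_episode_details returns a dict shaped like this
# (placeholder values; a key may be absent if its row fails to parse):
# {
#     "title_romaji": "...",     # from the "romaji" row
#     "title_en": "...",         # from the "official" row
#     "air_date": "YYYY-MM-DD",  # from the "date" row, reformatted
#     "rating": "...",           # from the "rating" row
#     "anidbid": "<episode id>",
#     "plot": "...",             # from the "summary" div
# }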

def fetch_episodes(season_id):
    """Fetch episodes for a specific season based on its AniDB ID and hardcoded table item ranges."""
    season_ranges = {
        6327: range(1, 16),  # Bakemonogatari - first 15 items
        8658: range(1, 12),  # Nisemonogatari - first 11 items
        9453: range(2, 6),  # Nekomonogatari Kuro - items 2-5
        9183: [i for i in range(1, 27) if i not in [6, 11, 16]],  # Monogatari: Second Season - items 1-26, excluding 6, 11, and 16
        10046: range(2, 7),  # Hanamonogatari - items 2-6
        10891: range(2, 6),  # Tsukimonogatari - items 2-5
        11350: range(2, 13),  # Owarimonogatari - items 2-12, starting at episode 2
        11827: range(1, 13),  # Koyomimonogatari - first 12 items
        8357: range(1, 4),  # Kizumonogatari - first 3 items
        13691: range(2, 8),  # Zoku Owarimonogatari - items 2-7
    }
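
    # These are 1-based row indices into AniDB's episode table; rows outside
    # the range (presumably specials, trailers, or other non-episode entries)
    # are skipped below via `if idx + 1 not in season_range`.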

    base_url = f"https://anidb.net/anime/{season_id}"
    print(f"[FETCH] {base_url}")
    try:
        res = requests.get(base_url, headers=HEADERS, timeout=30)
        res.raise_for_status()
    except Exception as e:
        print(f"[ERROR] Failed to fetch page: {e}")
        return []

    soup = BeautifulSoup(res.text, "html.parser")
    table = soup.find("table", id="eplist", class_="eplist")
    if not table:
        print("[WARN] Episode table not found.")
        return []

    tbody = table.find("tbody")
    if not tbody:
        print("[WARN] Table body not found.")
        return []

    episodes = []
    rows = tbody.find_all("tr")

    # Get the specific range of rows based on the season_id
    season_range = season_ranges.get(season_id)
    if not season_range:
        print(f"[ERROR] No episode range found for season ID {season_id}.")
        return []

    episode_number = 1  # Start episode count from 1
    for idx, row in enumerate(rows):
        # Only process rows within the specified range for the season
        if idx + 1 not in season_range:
            continue

        try:
            ep_id_cell = row.find("td", class_="id eid")
            title_cell = row.find("td", class_="title name episode")
            if not ep_id_cell or not title_cell:
                continue

            # Get the episode ID from the href of the <a> tag
            episode_link = ep_id_cell.find("a")["href"]
            episode_id = episode_link.split("/")[-1]

            episode_details = fetch_episode_details(episode_id)
            time.sleep(0.5)  # Be polite to AniDB between requests
            if not episode_details:
                continue

            # For Owarimonogatari, start counting from episode 2
            if season_id == 11350 and episode_number == 1:
                episode_number += 1  # Skip episode 1

            episodes.append({
                "number": episode_number,
                "title_en": episode_details.get("title_en", "Unknown Title"),
                "title_romaji": episode_details.get("title_romaji", "Unknown Romaji"),
                "plot": episode_details.get("plot", "No plot available"),
                "air_date": episode_details.get("air_date", "Unknown Date"),
                "rating": episode_details.get("rating", "Unknown Rating"),
                "year": episode_details.get("air_date", "")[:4] or "Unknown",  # Year (YYYY) from the air date
                "anidbid": episode_details["anidbid"]  # Set in fetch_episode_details
            })

            # Increment episode number after each episode
            episode_number += 1

        except Exception as e:
            print(f"[SKIP] Row parse error: {e}")
            continue

    return episodes


def pretty_print_xml(element):
    """Pretty print XML with newlines and indentation."""
    tree = ET.ElementTree(element)
    from io import BytesIO
    import xml.dom.minidom

    rough_string = BytesIO()
    tree.write(rough_string, encoding="utf-8", xml_declaration=True)
    rough_string.seek(0)
    dom = xml.dom.minidom.parse(rough_string)
    return dom.toprettyxml(indent=" ")
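
# Example (illustrative): for root = ET.Element("episodedetails") with a
# single child <title>Example</title>, pretty_print_xml(root) returns roughly:
#   <?xml version="1.0" ?>
#   <episodedetails>
#    <title>Example</title>
#   </episodedetails>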

def write_nfo(episode, season_name, season_number):
    """Writes an NFO file for a given episode."""
    # Check if anidbid is missing
    if "anidbid" not in episode:
        print(f"[WARN] Missing anidbid for episode {episode['number']}. Skipping this episode.")
        return

    # Use season_number to create the folder
    season_folder = f"Season {season_number:02}"  # Format as 'Season 01', 'Season 02', etc.
    season_dir = os.path.join("monogatari_nfo", season_folder)
    os.makedirs(season_dir, exist_ok=True)

    file_name = f"S{season_number:02}E{episode['number']:02}.nfo"
    file_path = os.path.join(season_dir, file_name)

    root = ET.Element("episodedetails")
    ET.SubElement(root, "plot").text = episode["plot"]
    ET.SubElement(root, "originaltitle").text = episode["title_en"]
    ET.SubElement(root, "title").text = episode["title_romaji"]
    ET.SubElement(root, "year").text = episode["year"]
    ET.SubElement(root, "aired").text = episode["air_date"]
    ET.SubElement(root, "rating").text = episode["rating"]
    ET.SubElement(root, "anidbid").text = str(episode["anidbid"])
    ET.SubElement(root, "episode").text = str(episode["number"])
    ET.SubElement(root, "season").text = str(season_number)

    pretty_xml = pretty_print_xml(root)

    with open(file_path, "w", encoding="utf-8") as f:
        f.write(pretty_xml)

    print(f"[WRITE] {file_path}")


def main():
    for season_number, season in enumerate(SEASONS, start=1):
        season_name = season["name"]
        season_id = season["id"]
        print(f"\n[INFO] Scraping episodes for {season_name} (ID: {season_id})...")

        episodes = fetch_episodes(season_id)
        if not episodes:
            print(f"[FAIL] No episodes found for {season_name}.")
            continue

        # Write .nfo files for each episode in the current season
        for episode in episodes:
            write_nfo(episode, season_name, season_number)

    print("\n✅ Done generating .nfo files for all Monogatari seasons.")


if __name__ == "__main__":
    main()
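
# Usage: run the script directly (it takes no arguments); .nfo files are
# written under ./monogatari_nfo/ relative to the working directory.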