optimized

This commit is contained in:
Norbert
2026-03-07 09:23:52 +01:00
parent d343a36b97
commit 8efbace2ec
4 changed files with 420 additions and 82 deletions

140
ids.py
View File

@@ -1,8 +1,6 @@
import requests
import re
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
#from hallen import *
NameTabelle=0
Alias=1
@@ -125,6 +123,19 @@ hallen.append(["06034", "Bicken."])
# Club fixture search URL on the nuLiga handball portal (club id 74726).
link_nächsteSpiele = "https://hnr-handball.liga.nu/cgi-bin/WebObjects/nuLigaHBDE.woa/wa/clubMeetings?searchType=0&searchTimeRange=5&searchTimeRangeFrom=&searchTimeRangeTo=&selectedTeamId=WONoSelectionString&club=74726&searchMeetings=Suchen"

# Shared HTTP settings: one session for connection reuse, fixed timeout (s).
REQUEST_TIMEOUT = 20
REQUEST_SESSION = requests.Session()

# Collapses runs of two or more whitespace characters into one space.
WHITESPACE_RE = re.compile(r"(\s{2,})")
# Captures the word after a hyphen in "…-X … Jgd" entries — presumably a
# youth/cup marker; confirm against the fixture strings it is applied to.
POKAL_RE = re.compile(r".\-(\w*)\s.*Jgd")
# Recognised league class names, optionally followed by a division number.
# Fix: the alternation previously listed "Kreisliga" twice; the duplicate
# alternative was redundant and has been removed (no behavior change).
LEAGUE_CLASS_RE = re.compile(
    r"\b(Regionalliga|Regionsoberliga|Regionsliga|Verbandsliga|Oberliga|Landesliga|"
    r"Bezirksliga|Kreisliga|Kreisklasse)(?:\s+\d+)?\b",
    re.IGNORECASE,
)
# Strips a leading "Region/Region 2024/25 " style season prefix.
REGION_SEASON_PREFIX_RE = re.compile(
    r"^[A-Za-zÄÖÜäöüß]+(?:/[A-Za-zÄÖÜäöüß]+)+\s+\d{4}/\d{2}\s*",
    re.IGNORECASE,
)
def halle(nummer):
for x in range(len(hallen)):
@@ -137,29 +148,27 @@ def halle(nummer):
def tabelle(team):
    """Return the league standings table for *team*.

    Fetches the first 'result-set' table from the team's league page and
    keeps columns 1..9 of every row (column 0 is dropped).
    """
    data = fetch_table(team[LigaLink], 0)
    return [row[1:10] for row in data]
def getliga(team):
try:
response = requests.get(team[LigaLink])
except:
print(team)
Fi =str(response.content.decode('utf-8', 'ignore'))
Fi_cleaned = re.sub(r'(\s{2,})',' ',str(Fi))
soup = BeautifulSoup(Fi_cleaned, 'html.parser')
soup = fetch_soup(team[LigaLink])
abschnitt = soup.find(id="content-col1")
#print("abschnitt")
#print(abschnitt)
parse=re.compile(r"br\/>\s(.*)\s\<br/>")
name=str(parse.findall(str(abschnitt))[0])
#print("Name")
#print(name)
if len(name) > 20:
if re.search(r'((.*)\s)\<br/>',str(name)) is not None:
name=re.search(r'((.*)\s)\<br/>',str(name)).group(1)
if abschnitt is None:
return ""
text = abschnitt.get_text(" ", strip=True)
text = re.split(r"\bBemerkungen?\b", text, maxsplit=1, flags=re.IGNORECASE)[0].strip()
text = re.split(r"\bTabelle\s+und\s+Spielplan\b", text, maxsplit=1, flags=re.IGNORECASE)[0].strip()
text = REGION_SEASON_PREFIX_RE.sub("", text).strip()
match = LEAGUE_CLASS_RE.search(text)
if match:
name = match.group(0)
else:
# Fallback: only first compact chunk, never full text block.
name = re.split(r"\s{2,}| - | \| ", text, maxsplit=1)[0].strip()
name=re.sub(r"m[ABCDEF]-Jugend", '', name)
name=name.replace(' Männer','')
name=name.replace(' Frauen','')
@@ -172,27 +181,8 @@ def getliga(team):
return(name)
def teamspielplan(team):
    """Return the match schedule for *team*.

    Fetches the second 'result-set' table from the team's page and keeps
    the first 8 columns of every row.
    """
    data = fetch_table(team[TeamLink], 1)
    return [row[0:8] for row in data]
def Teamspielplan_kuerzen(Mannschaft):
#if "Eschweiler" in Mannschaft:
@@ -228,30 +218,40 @@ def Teamspielplan_kuerzen(Mannschaft):
return(Mannschaft)
def fetch_table(Link, nummer):
    """Fetch *Link* and return the *nummer*-th 'result-set' table as rows.

    Each row is a list of stripped cell strings. The header row is not
    included in the result; any column whose header text contains
    "bemerkung" (remarks) is removed from every data row. Returns an
    empty list when the requested table or its rows are missing.
    """
    soup = fetch_soup(Link)
    tables = soup.find_all('table', attrs={'class': 'result-set'})
    if nummer >= len(tables):
        return []
    table = tables[nummer]
    rows = table.find_all('tr')
    if not rows:
        return []
    header_cells = rows[0].find_all(['th', 'td'])
    header = [cell.text.strip().lower() for cell in header_cells]
    # Column indices to drop (remarks columns identified via the header).
    skip_indices = {index for index, title in enumerate(header) if "bemerkung" in title}
    data = []
    for row in rows[1:]:
        cols = [ele.text.strip() for ele in row.find_all('td')]
        if skip_indices:
            cols = [value for index, value in enumerate(cols) if index not in skip_indices]
        data.append(cols)
    return data
def fetch_soup(link):
    """GET *link* via the shared session and return it parsed as soup.

    Runs of whitespace are collapsed to single spaces before parsing.
    On any request failure the failing URL is printed and an empty soup
    is returned, so callers never need their own try/except.
    """
    try:
        response = REQUEST_SESSION.get(link, timeout=REQUEST_TIMEOUT)
    except requests.RequestException:
        print(link)
        return BeautifulSoup("", 'html.parser')
    # .decode() already yields str — the redundant str() wrappers around
    # the decode result and again around the page text were removed.
    text = response.content.decode('utf-8', 'ignore')
    cleaned = WHITESPACE_RE.sub(' ', text)
    return BeautifulSoup(cleaned, 'html.parser')
def naechsteSpiele(number):
data=fetch_table(link_nächsteSpiele,0)
response = requests.get(link_nächsteSpiele)
Fi =str(response.content.decode('utf-8', 'ignore'))
Fi_cleaned = re.sub(r'(\s{2,})',' ',str(Fi))
soup = BeautifulSoup(Fi_cleaned, 'html.parser')
for x in range(len(data)):
data[x]=data[x][0:8]
CTR = 0
@@ -261,12 +261,6 @@ def naechsteSpiele(number):
if CTR == number:
break
tabelle=data[:x]
links=[]
nummern=[]
#for link in soup.find_all('a'):
# if len(link.contents[0]) == 4:
# nummern.append(link.contents[0])
#del nummern[0]
for x in range(len(tabelle)):
if tabelle[x][3] != '':
tabelle[x][3]=halle(tabelle[x][3])
@@ -301,9 +295,7 @@ def naechsteSpiele(number):
tabelle[x][6] = '\\textbf{Minis}'
else:
tabelle[x][7] = '\\textbf{Minis}'
parse=re.compile(r".\-(\w*)\s.*Jgd")
pokal=parse.findall(str(tabelle[x][5]))
print(pokal)
pokal=POKAL_RE.findall(str(tabelle[x][5]))
if (len(pokal) != 0 and tabelle[x][6] !="spielfrei" and tabelle[x][7] !="spielfrei"):
#print('POKAL!!!', pokal)
parse=re.compile(r"\sSG\s(I.*)")
@@ -346,9 +338,9 @@ def naechsteSpiele(number):
continue
else:
if tabelle[x][1] == last_date:
tabelle[x][0] ==''
tabelle[x][1] == ''
tabelle[x][0] = ''
tabelle[x][1] = ''
else:
last_date = tabelle[x][1]
#print(last_date)
return(tabelle)
return(tabelle)