segmentation service handle 1.2.3

parent 05195a99
from langdetect import detect from langdetect import detect
from typing import List, Dict from typing import List, Dict
import re
class LanguageSegmentationService: class LanguageSegmentationService:
""" """
...@@ -74,8 +75,17 @@ class LanguageSegmentationService: ...@@ -74,8 +75,17 @@ class LanguageSegmentationService:
if not stripped: if not stripped:
return True return True
# Pure numbers (with optional punctuation like "1." or "#1") # Check if it's a number (including decimals like 1.2, 3.14, etc.)
if stripped.replace('#', '').isdigit(): # Remove # and check if what remains is numeric (allowing dots for decimals)
cleaned = stripped.replace('#', '')
# Check for section/chapter numbering like 1.2.3 or 1.2 or just 1
# Pattern: only digits and dots (comma-grouped numbers are checked separately below)
if re.match(r'^[\d.]+$', cleaned) and any(c.isdigit() for c in cleaned):
return True
# Also handle numbers with commas (like 1,000)
if re.match(r'^[\d,]+$', cleaned) and any(c.isdigit() for c in cleaned):
return True return True
# Special markdown-like markers (##, ###, etc.) # Special markdown-like markers (##, ###, etc.)
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment