Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 20 additions & 2 deletions README.MD
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,13 @@ Markdown Reader is a clean and intuitive Markdown reader with real-time preview

## Editor Overview

<img width="1436" alt="Image" src="https://github.com/user-attachments/assets/0e5400c8-e26b-4309-9fa7-2127890d0f8c" />
<img width="1428" height="737" alt="Image" src="https://github.com/user-attachments/assets/4f584f5b-dd1a-4d74-9a99-ca2cd90a9994" />

---

## Preview Overview

<img width="1417" alt="Image" src="https://github.com/user-attachments/assets/a0e1ef35-0216-4d48-8dca-2dfde8244bad" />
<img width="1417" height="474" alt="Image" src="https://github.com/user-attachments/assets/9a0006ed-d269-428b-a41f-e512cc7ba9c9" />

---

Expand Down Expand Up @@ -131,6 +131,24 @@ Python >= 3.10

---

## AI-Powered Translation

To enable the AI-powered translation features, you need to set up API keys:

```bash
# Copy the example configuration file
cp .env.example .env
```

**How to get API keys:**
- **OpenRouter** (recommended for free tier): [openrouter.ai](https://openrouter.ai/)
- **OpenAI**: [platform.openai.com](https://platform.openai.com/)
- **Anthropic**: [console.anthropic.com](https://console.anthropic.com/)

**Rate limits:**

The app will automatically switch to a fallback provider if the primary provider hits rate limits.

---

## License

This project is licensed under the **MIT License**.
Expand Down
Binary file modified README.pdf
Binary file not shown.
63 changes: 57 additions & 6 deletions markdown_reader/logic.py
Original file line number Diff line number Diff line change
Expand Up @@ -557,7 +557,7 @@ def _is_probably_math_token(token):
if core.startswith(('http://', 'https://', 'www.', 'file://')):
return False

if '$' in core or '\(' in core or '\)' in core or '\[' in core or '\]' in core:
if '$' in core or '\\(' in core or '\\)' in core or '\\[' in core or '\\]' in core:
return False

# Reject pure lowercase English words (at least 3 chars), but allow math vars
Expand Down Expand Up @@ -625,7 +625,7 @@ def _is_markdown_media_line(stripped_line):

def _auto_wrap_bare_math_spans(markdown_text):
"""
Wrap likely inline bare math tokens with \(...\) in mixed prose lines.
Wrap likely inline bare math tokens with \\(...\\) in mixed prose lines.
Returns list of (protected_text, replacements_dict) to preserve LaTeX delimiters.

:param string markdown_text: Original markdown text.
Expand Down Expand Up @@ -1466,8 +1466,20 @@ def convert_pdf_to_markdown(pdf_path):
page = doc[page_num]
blocks = page.get_text("dict").get("blocks", [])

# Normalize block order by page coordinates so images/text keep visual reading order.
# Some PDFs return mixed block sequences where later sections can appear before figures.
indexed_blocks = list(enumerate(blocks))
indexed_blocks.sort(
key=lambda item: (
((item[1].get("bbox") or [0, 0, 0, 0])[1]),
((item[1].get("bbox") or [0, 0, 0, 0])[0]),
item[0],
)
)
ordered_blocks = [block for _, block in indexed_blocks]

font_sizes = []
for block in blocks:
for block in ordered_blocks:
if block.get("type") == 0:
for line in block.get("lines", []):
for span in line.get("spans", []):
Expand All @@ -1486,7 +1498,7 @@ def convert_pdf_to_markdown(pdf_path):
in_code_block = False
pending_list_marker = False

for block in blocks:
for block in ordered_blocks:
block_type = block.get("type")

if block_type == 1:
Expand Down Expand Up @@ -1562,6 +1574,8 @@ def convert_pdf_to_markdown(pdf_path):
continue

is_code_line = _is_pdf_code_line(line_text, has_monospace)
if not is_code_line and in_code_block and _is_pdf_code_continuation_line(line_text):
is_code_line = True
if is_code_line:
if not in_code_block:
output_lines.append("```bash")
Expand All @@ -1584,6 +1598,14 @@ def convert_pdf_to_markdown(pdf_path):
pending_list_marker = False
continue

clean_line = line_text.replace("**", "").replace("*", "").strip()
# Recover common PDF-rendered markdown subheadings like "2. Create ..."
# that may otherwise degrade into bold paragraph text.
if re.match(r'^\d+\.\s+\S+', clean_line) and (line_text.startswith("**") and line_text.endswith("**")):
output_lines.append(f"#### {clean_line}")
output_lines.append("")
continue

if max_font_size >= heading1_threshold:
clean_heading = line_text.replace("**", "").replace("*", "")
output_lines.append(f"# {clean_heading}")
Expand Down Expand Up @@ -1672,6 +1694,10 @@ def _convert_pdf_to_markdown_fallback(pdf_path):
markdown_text += line.strip() + "\n"
continue

if in_code_block and _is_pdf_code_continuation_line(line):
markdown_text += line.strip() + "\n"
continue

if in_code_block:
markdown_text += "```\n\n"
in_code_block = False
Expand Down Expand Up @@ -1723,7 +1749,7 @@ def _is_pdf_code_line(line, has_monospace_font=False):

code_patterns = [
r'^\$\s+\S+',
r'^(sudo|pip|python|python3|npm|node|git|cd|ls|mkdir|rm|cp|mv)\b',
r'^(sudo|pip|python|python3|npm|node|git|cd|ls|mkdir|rm|cp|mv|source)\b',
r'^(if|for|while|def|class|return|import|from|try|except|else|elif)\b',
r'\b(function|const|let|var|echo|export|chmod|chown|brew|apt|yum|conda)\b',
r'`[^`]+`'
Expand All @@ -1736,6 +1762,31 @@ def _is_pdf_code_line(line, has_monospace_font=False):
return False


def _is_pdf_code_continuation_line(line):
"""
Detect lines that should remain inside an already-open PDF-derived code block.

:param string line: A single extracted line.
:return: True if the line likely continues code/command content.
"""

stripped = line.strip()
if not stripped:
return False

if stripped.startswith("#"):
return True

continuation_patterns = [
r'^(source|export|set|unset|alias|PATH=)\b',
r'^(\./|\.\\|\.\./|\.\.\\)',
r'(^|\s)(venv|scripts|bin|powershell|cmd|activate)(\s|$)',
r'[/\\].*(activate|python|pip)',
]

return any(re.search(pattern, stripped, re.IGNORECASE) for pattern in continuation_patterns)


def _is_standalone_list_marker(line):
"""
Detect lines that contain only a list marker (common in PDF extraction),
Expand Down Expand Up @@ -1791,7 +1842,7 @@ def _clean_list_item(line):
"""

# Remove common list markers
cleaned = re.sub(r'^\s*[•●○▪▫■□✓✔-–—]\s+', '', line)
cleaned = re.sub(r'^\s*[•●○▪▫■□✓✔\-–—]\s+', '', line)
cleaned = re.sub(r'^\s*\d+[\.)]\s+', '', cleaned)
cleaned = re.sub(r'^\s*[a-z][\.)]\s+', '', cleaned)
cleaned = re.sub(r'^\s*[ivxIVX]+[\.)]\s+', '', cleaned)
Expand Down
Loading