class IpynbConverter(DocumentConverter):
    """Converts Jupyter Notebook (.ipynb) files to Markdown.

    Markdown cells are emitted as-is, code cells are wrapped in fenced
    ``python`` code blocks, and raw cells are wrapped in plain fenced blocks.
    Cell outputs are not included.
    """

    def convert(
        self, local_path: str, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
        """Convert the notebook at ``local_path`` to Markdown.

        Args:
            local_path: Path to the notebook file; read as UTF-8 JSON.
            **kwargs: Must carry ``file_extension`` for the file to be
                considered; only ``.ipynb`` (case-insensitive) is handled.

        Returns:
            The conversion result, or ``None`` when the extension is not
            ``.ipynb`` so that another converter can handle the file.

        Raises:
            FileConversionException: If the parsed notebook cannot be
                converted (raised by ``_convert``).
        """
        # Bail if not ipynb
        extension = kwargs.get("file_extension", "")
        if extension.lower() != ".ipynb":
            return None

        # Parse the notebook; the file is closed before conversion starts.
        with open(local_path, "rt", encoding="utf-8") as fh:
            notebook_content = json.load(fh)

        return self._convert(notebook_content)

    def _convert(self, notebook_content: dict) -> Union[None, DocumentConverterResult]:
        """Helper function that converts notebook JSON content to Markdown.

        Args:
            notebook_content: The parsed notebook JSON (top-level object).

        Returns:
            A result whose text is the cells joined by blank lines. The title
            is ``metadata.title`` when present, otherwise the text of the
            first ``# `` heading line found in a markdown cell, otherwise
            ``None``.

        Raises:
            FileConversionException: Wraps any error raised while walking the
                notebook structure.
        """
        try:
            md_output = []
            title = None

            for cell in notebook_content.get("cells", []):
                cell_type = cell.get("cell_type", "")
                source = cell.get("source", [])

                # The notebook format permits ``source`` to be either a list
                # of lines or one multi-line string. Split a string into lines
                # so the heading scan below sees whole lines rather than
                # single characters; joining them back yields the same text.
                if isinstance(source, str):
                    source_lines = source.splitlines(keepends=True)
                else:
                    source_lines = source
                cell_text = "".join(source_lines)

                if cell_type == "markdown":
                    md_output.append(cell_text)

                    # Extract the first # heading as title if not already found
                    if title is None:
                        for line in source_lines:
                            if line.startswith("# "):
                                # Drop exactly the "# " marker. lstrip("# ")
                                # would strip a character set and eat leading
                                # '#' characters that belong to the title.
                                title = line[2:].strip()
                                break

                elif cell_type == "code":
                    # Code cells are wrapped in Markdown code blocks
                    md_output.append(f"```python\n{cell_text}\n```")
                elif cell_type == "raw":
                    md_output.append(f"```\n{cell_text}\n```")

            md_text = "\n\n".join(md_output)

            # A title in the notebook metadata takes precedence over a heading
            title = notebook_content.get("metadata", {}).get("title", title)

            return DocumentConverterResult(
                title=title,
                text_content=md_text,
            )

        except Exception as e:
            raise FileConversionException(
                f"Error converting .ipynb file: {str(e)}"
            ) from e
-879,6 +936,7 @@ class MarkItDown: self.register_page_converter(WavConverter()) self.register_page_converter(Mp3Converter()) self.register_page_converter(ImageConverter()) + self.register_page_converter(IpynbConverter()) self.register_page_converter(PdfConverter()) def convert( diff --git a/tests/test_files/test_notebook.ipynb b/tests/test_files/test_notebook.ipynb new file mode 100644 index 0000000..62db0fa --- /dev/null +++ b/tests/test_files/test_notebook.ipynb @@ -0,0 +1,89 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "0f61db80", + "metadata": {}, + "source": [ + "# Test Notebook" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "3f2a5bbd", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "markitdown\n" + ] + } + ], + "source": [ + "print('markitdown')" + ] + }, + { + "cell_type": "markdown", + "id": "9b9c0468", + "metadata": {}, + "source": [ + "## Code Cell Below" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "37d8088a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "42\n" + ] + } + ], + "source": [ + "# comment in code\n", + "print(42)" + ] + }, + { + "cell_type": "markdown", + "id": "2e3177bd", + "metadata": {}, + "source": [ + "End\n", + "\n", + "---" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.8" + }, + "title": "Test Notebook Title" + }, + "nbformat": 4, + "nbformat_minor": 5 +}