From 08ed32869eae01d0b7c39944a092b90221f81ae6 Mon Sep 17 00:00:00 2001 From: yeungadrian <47532646+yeungadrian@users.noreply.github.com> Date: Fri, 3 Jan 2025 21:58:17 +0000 Subject: [PATCH] Feature/ Add xls support (#169) * add xlrd * add xls converter with tests --- pyproject.toml | 1 + src/markitdown/_markitdown.py | 27 ++++++++++++++++++++++++++- tests/test_files/test.xls | Bin 0 -> 27648 bytes tests/test_markitdown.py | 12 ++++++++++++ 4 files changed, 39 insertions(+), 1 deletion(-) create mode 100644 tests/test_files/test.xls diff --git a/pyproject.toml b/pyproject.toml index 67f6825..9c113ad 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,6 +32,7 @@ dependencies = [ "python-pptx", "pandas", "openpyxl", + "xlrd", "pdfminer.six", "puremagic", "pydub", diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index d209b5e..50c83b4 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -726,7 +726,31 @@ class XlsxConverter(HtmlConverter): if extension.lower() != ".xlsx": return None - sheets = pd.read_excel(local_path, sheet_name=None) + sheets = pd.read_excel(local_path, sheet_name=None, engine="openpyxl") + md_content = "" + for s in sheets: + md_content += f"## {s}\n" + html_content = sheets[s].to_html(index=False) + md_content += self._convert(html_content).text_content.strip() + "\n\n" + + return DocumentConverterResult( + title=None, + text_content=md_content.strip(), + ) + + +class XlsConverter(HtmlConverter): + """ + Converts XLS files to Markdown, with each sheet presented as a separate Markdown table. + """ + + def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: + # Bail if not a XLS + extension = kwargs.get("file_extension", "") + if extension.lower() != ".xls": + return None + + sheets = pd.read_excel(local_path, sheet_name=None, engine="xlrd") md_content = "" for s in sheets: md_content += f"## {s}\n" @@ -1353,6 +1377,7 @@ class MarkItDown: self.register_page_converter(BingSerpConverter()) self.register_page_converter(DocxConverter()) self.register_page_converter(XlsxConverter()) + self.register_page_converter(XlsConverter()) self.register_page_converter(PptxConverter()) self.register_page_converter(WavConverter()) self.register_page_converter(Mp3Converter()) diff --git a/tests/test_files/test.xls b/tests/test_files/test.xls new file mode 100644 index 0000000000000000000000000000000000000000..de4f368c24d489ff7aca786acc29d80652189eb9 GIT binary patch literal 27648 zcmeHQ2V7J~x1U|wf+$5qipWY)L_m5`aVesxfY=K*$|6JrL{R)9fvCh9MKo5TF^C#t z?_JbbP)t1ti zc8Api!a2GVE%I5SPjp#y9$b6N^2P-2OC*?;eR{z)kTl2tA`9pXpdod2$cTOy^o|og z(G?K#06w>CV~&uAkVcYpVn|4ELPm0Is&KeCD|MXkzxoKFI>ho&hf;97K%O??0j&s< z39+QQnbLDhdfrAk57ToXF(r~*L#?wT{C)kQJb{wK_(7EKTY9#m=N540k$tqxO-k_~ zZOLum7mxyUb`Y+|3nj56g(QUkPaT|0Kki%#I6?!zJsWVA}TKLx0r8%cA zLJ|Qr8b^{yCasqXj|bf9Bx*Ovfm*A_K7>fOW({(5YmlRR0~|aeQd`^CP#5n8 zI4~lr_Z1$|ZxF+v0SrEA-2g{JzJgZQSBcS9yA9C+8N{yG&fdk|-hE8Rt|OgfnUNja zYPW-tppD%9m;vI1QSwZ7M330iGr76T<3JvvI)Gc-ak)5XQdU3WOgsr0gQ_?SWtvH| zBb_K=WXG;jf}N`!=sm_MXbebkBOT!nin7Z&3)zL7g>ofC#D#Q`7sk{;ULIAAB~m1+ z7wStoK~Jg|Dl1|BSOZ+c;++s81Sk6 zcS~<+ucG!>q;FF}pQD1lQU!gD3i>7$^vx>hs`Oc^g8yd~bXE9G<^Ol-vqOcR2PoZK zp}iP>Mfx5U{QFeUcd4KsRY7O;ff*9Vf3{G5m|rpUB_<}znk{Qaw3&7>t)nL<9dVmyn4Qtnpq?&-OGP?IZVMTIoqZVpF_F=)kg;%SCBttnqu1%l_$RJZXuXO| zOw5~^gPoavXC$KoK4WLb{<<(8Dzm4ajGv=tD;fO?ObE%2IyIxRq&!?>UYml%U^@PuKdNd>S`np6P0MUx6(e`!(y?5IsDfRU+51+a@XsQ~uj zCKbR=*`xy9n_8e#Qwz8>wSa3=3%E740L(_4W~Huj1+c@iqXsWW5X=cPFj<8FBMSoD z0(^lq{$NjLCe}isA_&aT8wQbx)PvyWQG=RzddY)uqoy~2)Vq~Jbre;1uak=55CXe| zZ7{wz)E{r0e)X$Fb^V>>K#GX2auCTGbs&E7AVXDXBB?H7X|jmZoIU`eN1GG17KtXS z(4>4t9f+Q^in+PD5~1|KW=8-vO`k=BTl6qcLdy|wl?nh}wo)SHO1*viRzWF|NTi?? znDtD|sJKtc5?YF;0WEP?-P7 zI;KQYu274eJ9jEL4Vxyb1=0YU9x|t4(-h}eoq@N$7U&0Zrp*xy=JdyIP(ZsbyeiZJ z?T*!B2Q{?2j3(=kigC*hO4Ub8$S5|l3`BL@!j(xqh=O)#V=W?1WNW9W2`;x91SyYD z4+1+&4TErlKezYT%Yi^fxlM4AsoEyE>S(Y!;zaczu#wgM5ZFj-7{t>{J&1!G z2xOGo1n1GJZGtP+M%Y9>2&|VH1}TqF57M?VAf8_8K^)~kAfwzSxKvPW6I}H*!Y1lL zEE@w-9-$t@wlN@{Ug|;G%Yi^fxlM32q}nECav&<>iFy!N=QnKF@(A@H!p4AjdZ`EL zAO`{&Kcg-UZZ!w zEWi;rGf<2PSB$5k7&|p$m^BC$qa#lI`_Wb9VobSWycET>S0jd5rBE@t;=~s(UMLsS zk}JkrQH--1F}RMA8ck1}D0y41T#Ol4jE|xiH#K5#xgZrI5GUSv@K(8)R$MW@iefy~ zh+(F+)M)zR#D}kLDi>ppV!YLRnk|U&ZiJ_mM;y8KQn{GcTro;K&0dWdwx^Xxyg$E1 zxflzs7$u(8L5&!;rLJ;HI2v76$T6N-yaxpesF-knm zL5&!;r+Iq4+dfmdn6_LoN<7U;jTpA4d3u$Vl_?iv%N3)<(_GYuVSAdVm*hsGaxp@# z7$u(Ou0{;o(>%Qv{G_7awBw3V;%VM$#IQZh)9a^;3zXHy5T0|xysK{94CaiRHWEFR zVl^H52SR#e{GuGRLp>;X0vFVS1IliF zLejVNQU)p@PB6NXkl2JmL4bVA^QO9Nf^n7H}$aX zQ)&CBfg)4E=7T_;Z14x%yppv(Hn!Bg^r3FBJ<}^K4IH(rJTA{lqD^{%(b71siaX*m zED$EcY5}jmzzQN<9Gj2~ZwbO$fupFFF<3Sn6hhxm21Un#g3MD0gwul#(__#V3>>w@77i67`sqQqI58L<#vIDwhh1_E;w0Ao!vz^5>%FqrlTMXn;T{sr7kq3DLQ(zxZQ+iAu{97YgF!G57}1Ul1b`lp;_+S>(ly zYFS9bP3HPQ%j9GsjmgPE8k1YhZ$<(LY-;t(%A_qmg(I{TK=-3IP;7H{$j1mN*XBs5 z*M!N*G+}ZwO_&@8%r1F4N`jvb1ixsga|X1qZ0V0iLuc)Z&N?VLGX?DBA+NfY5j4;s z@Q+MrI<%Ogzw;2H8-+;AL?7VnK4@7|2sAm<{@T>7^|5PHba z*o;&RT1O=J$>^F6N$yoZ%az51R6h0#c-x!!A7H4|olMbXJ7%8^I zJ;2@9-NVz{HO@!u=j!d};pZBV;O^(@74PotD~?+!upld0>M=gfJZdS1m%@u#_YiPC z_&g8ZkUY*4KqAEFj~3)_vIxh2nxIF7rlkZkA0f;~DD%++nm;%tJt-ElA>zzf$o7hz zFabY%h*L6S9pJ%hVq&~sLTrMok7q)>t9M+0ziU984`e;WzCH>5vHr0M?!e}onCR`{ z=N0Gb?HTI>b@KNHHje<;MDKujfbdQT@Jf)t+us7x8y>bpo06IE*$Y171?G_8LVB1> zKlx;-U~#jhtUGuky04bD8CVs`?Z~scM#rF8NgH@})e=5qLeo++VJB?NpsDHNNu%7d zC!}m%bUx~&yKz`e_}?=gyY?yC86*QT>VMm!dd?#MsE9H8d27#@!_k+DyIt_GE(zc3{++4O>>K?K57-)# zaH-7ra-XnU-f<4O8x~C*bIo@7YRmh+5fy)#54h$xcI(28Ws+Sd-OE?^JD$7E<#CvH zs^5}`o3vI;))!{s%%u05`r;rHu3 zv&vO{n-6y>(rZ3fe8Fw@zSq-SHdt@G8Q$iUOUi>Yt)4u~wkX^EZOePT&ra%}V_#X2 zzh%!I;nh8zD%-tsxik8J!xw?`dJTPAQPc5ZSaqlN27G_z$`@C==Qe{uWimX4z}~=8 zo2h~as{MTUg?ZL~X2)885E)KNOL_Ide)PlUhd!U(dm}RB`MO6=zn%4(>r)(DIHfA4 zchnIxgXCu=7h{Y{ye|fYcK#G^ZhG_G!l#Q9yDd!#_;`H4$0qSElo?>{pfPaCU-%8?xdQkg{eQ- zSXOC&Id;uln=5bPJMQ|vPnfp*{9k;QJ}`S3Y0X5Y4hLl#7priFBhsr+JBp|J47 z=((opUrzCC$NOi`b0x5MzH47;ZTbA+4;?e@6KW9Eq zn7JnWUU7K%{!4Fu{_xwj=XW>$@coI|=bG>R=1!2L)#UtO|2vl_y-gY!HDa6J9~X{X zKBgOeU_tf14;Q}L^=<>tNien~pmoK#_+v-zjW-*nwIny|{mVW7xaHFMQrBA!e?JbsxV!ZFO{X+JvlX^U*UhrcG*oZ_~@a{%{2@565Sj%|}h} z-gjtT<D(K;1Hu{?Z?Z@svIM`;;z6ZxXT=~n-#$@-5MX_75c`bMNu6v(tci&;L zjl}EqAJzrewvGP2d(PAk@Ag%U`z>nC=K!p0?P30$+&}+ZmiCpTYShcvQ|F@(fBxsGRm(b$&aJ6jBMO|7eXlAh<|ML8G{@oS!f1G#`-*u)@z{dH_vd91P`ok{s zYa>k0l<%whh2PerXN~0)-TQl2o}c|XX=%@wWwqtM-0Ai7y3dr>B}1lv+-3OayZs|4 z$ERF6+^fHH|Cs|_0<#uIdQ9H@(xlyxkqg61SB!1>EYR+5ce`H;=AXUpysqRDueY%C zq}IXVds?k3o^`i-@z%jLuXlVq^9;YgUaiUcAr@(CSM->3{;|OEisAcqg*)awO8-@> z<+awPo;I=P4u3T}KmEq-$J$|ZOyFN9xZRgSESA_EYG(o z^!DAcW^TtNPH$g&JezLo@ln73wF12*sS!1y)3;=>M~ZD#HIru@={t8Wd59K4odpEG2% zarEBS?P4Pm7hj)TkmRa6xZqt`o9R!4YllvDbh+8W{MvUP!!}(VM)r7i+CK5^*h7_X zth3yC_WlFf%?f3;Dc@x6<)K8eoXz|9(4GT$WhyGPcd??D#HKH6}}j6%QWBGn|umO|+-m%64DZtSBjPtxhZPXti_v z(da8aOBUL%Jhkzu)B5GVr@KCOw8gsf8V~N zdi8{LcH?G0**R{+rXLTkk2$s3(tN^L$EC~0zt#OYY<6%(d5blDYc@UF+j_*lo&B5+ zX=@*zzoxi)b#Xh3D@zv-zhYLEWjSQ!sQEXK#P2Be+W7s3-WHqgjod$KPRtGaqKf19 zOS*`zZyDlh7-{~ok4^HYle21XXSlT!BZ!3_A6!Wsdk6M#sA6Ygd+Yo<>Tq@`6HG+xyH&g_d@ zu4v7CMHlA|s?FQ#U%YV0BfH}D!DkO$ z46n(``Qgnf-n3H=N8*gXtR$98qgP6Hee$=OGp@6(`KZtRgD2g{*KzvsboqVzRhuHT z&&3=(_{%-07*ll~-xXemQ8gn3HrAnJei+?X9)LSVsxR#h_GC?%xCOaWj{1U@Fc)Jt z0@wUpB<}cMk91WT5}gfjF}O}pkI@P?YS`MSaf#@@I{3l{tlD(QD5$+LE-A<=Sc9x$ zP0Qg1nGdx2EE@75GxH@@W4SM}HshP&ZF=ClC%53;*nnUH+y3~aPeV8-5F>hR0%wp4 zll{McdUH84E?P8((w!;26FpC-lsvj1U@#qFDNKk9?`ybo92N_Nw1PB)a?hma_Q0J; zk0&Dh&YSGosSf3-s3YroEj%sI0z@9@$HN^4IK1dlA(DL>9V>!;$0CjmD690;4Y#D| zF&MNkAUSjyM0k|~eCHYt*y+LpZNj{HCg9_XV0i6ZMfri?1 z`Cz%HK*NUT^1*(O0u3w7prJQ#`EUusrQy<$OM_>E3N*A8mk;g8rJ)76G<=ne zOEXnM!zCq`5B-TtL;vE^aN^FT;oE0y+9%pa2?J)j7^oL= zv_Wc6e|pb>o;{=(s4LtnrKP1(7^J>Z4%8np{Af92JTrVGK+1u20%{(E&y?r&_OQWS6=#PjIO*uU2 znL!Gda-er21|!?7o`Yh11^Z#>tH==q%cz887*8Zx~-TPRc%cFjoG1#!Ivs z##Q)ni2sK1l8M9^v+-bZ7_+hNOb%l<#$G0eF&pD7D`pegccBNNe+lT>SnAnW4yjCJ zVWI$yy+kPGki~2}T8@du?ddfW_bsR;5W|^LUFg{@h)?y=23=qd0e7##7B$W)qKT}R zQ2&QJpcp+j;QywyT097AaHZ8K|ChpQJ{n_z&#wlNpt+@Vw3JEuwas3|+3AccTLBh>a+@!J|x^9I0=cpAAoxacrfdAV~T}&#KeR9*%pLgdEAd+-hU)QxNP9X zM`9=~hFc8?IW#R}{3LvRauSt{h?+xV0gVMT7SLEgV*!l?G#1cUKw|-o1vD1WSU_We z|63Nobv&+kam|hEZ+y)K-%i3cH9phFwLZSmh6`m}v*Y?*`YIV+d@ zg8#w+eq?~T?*gB$klY}3hU5+jzdG&-3Cn+igrAzD@fH{W-!sLp&I;kz1Mug%@D~G` za+nhx`tlh{L_ZP=nEjJd2BfDlEZP5y!h1tOdIo$qJdWxyUKzVu8vGBSI{B!BT4vVG ziXKcxhYY0Oy2YP_z`ElX=M$+;NU8q~4OUUPODB+zeF*zcW98c;0&bAOp+kCrH2lqu W3Gf|YY=Nff None: result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xlsx")) validate_strings(result, XLSX_TEST_STRINGS) + # Test XLS processing + result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xls")) + for test_string in XLS_TEST_STRINGS: + text_content = result.text_content.replace("\\", "") + assert test_string in text_content + # Test DOCX processing result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.docx")) validate_strings(result, DOCX_TEST_STRINGS)