From 30d8afdd132166f2f1a471e010915e83daa8daae Mon Sep 17 00:00:00 2001 From: David Graham Date: Fri, 27 Sep 2024 17:40:31 +0000 Subject: [PATCH 01/20] init --- Cargo.lock | 2 +- Cargo.toml | 2 +- pyproject.toml | 2 +- python/dolma/cli/deduper.py | 24 ++++++++++++++++++++++-- src/deduper.rs | 30 ++++++++++++++++++++++++++++-- 5 files changed, 53 insertions(+), 7 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a17e9105..1457f246 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -987,7 +987,7 @@ dependencies = [ [[package]] name = "dolma" -version = "1.0.9" +version = "1.0.14" dependencies = [ "adblock", "ahash", diff --git a/Cargo.toml b/Cargo.toml index 44963812..547d0d00 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "dolma" -version = "1.0.9" +version = "1.0.14" edition = "2021" license = "Apache-2.0" diff --git a/pyproject.toml b/pyproject.toml index e7cc8cdc..befbebaa 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dolma" -version = "1.0.14.post1" +version = "1.0.14.dev6" description = "Data filters" license = { text = "Apache-2.0" } readme = "README.md" diff --git a/python/dolma/cli/deduper.py b/python/dolma/cli/deduper.py index c4a60a66..de6a43d5 100644 --- a/python/dolma/cli/deduper.py +++ b/python/dolma/cli/deduper.py @@ -1,3 +1,5 @@ +import fnmatch +import os from contextlib import ExitStack from dataclasses import dataclass from pathlib import Path @@ -99,6 +101,13 @@ class DedupeConfig: partition_index: Optional[int] = field( default=0, help="The index of the partition being processed, in the range [0, num_partitions)." ) + file_partition: Optional[bool] = field( + default=False, help="Whether or not to partition at the document level (vs at the span level)" + ) + document_dir: Optional[str] = field( + default="documents", + help="The folder in source paths to replace with 'attributes' to store results, if not 'documents'", + ) @dataclass @@ -135,7 +144,6 @@ def run(cls, parsed_config: DeduperConfig): logger = get_logger("tagger") dict_config: Dict[str, Any] = {} - with ExitStack() as stack: work_dirs = stack.enter_context(make_workdirs(parsed_config.work_dir)) @@ -146,6 +154,8 @@ def run(cls, parsed_config: DeduperConfig): "min_words": parsed_config.dedupe.min_words, "num_partitions": parsed_config.dedupe.num_partitions, "partition_index": parsed_config.dedupe.partition_index, + "file_partition": parsed_config.dedupe.file_partition, + "document_dir": parsed_config.dedupe.document_dir, } try_name = parsed_config.dedupe.name if not om.is_missing(parsed_config.dedupe, "name") else None @@ -182,7 +192,17 @@ def run(cls, parsed_config: DeduperConfig): # perform some path validation to make sure we don't call the mixer with invalid config total_matching_documents = 0 for document in parsed_config.documents: - dict_config.setdefault("documents", []).append(str(document)) + + if not any( + fnmatch.fnmatch(dict_config["dedupe"]["document_dir"], part) for part in document.split(os.sep) + ): + raise DolmaConfigError( + f"Path ({document}) does not contain expected document directory: '/{dict_config['dedupe']['document_dir']}/'. " + ) + + doc = str(document) + + dict_config.setdefault("documents", []).append(doc) current_matching_documents = sum(1 for _ in glob_path(document)) if current_matching_documents == 0: diff --git a/src/deduper.rs b/src/deduper.rs index f2ad99b1..3eb3a59d 100644 --- a/src/deduper.rs +++ b/src/deduper.rs @@ -14,8 +14,9 @@ use crate::s3_util; use crate::shard::shard_config::{CompressionConfig, WorkDirConfig}; use crate::shard::{find_objects_matching_patterns, FileCache}; use crate::wimbd::tokens::tokenize; - +use ahash::RandomState; use deduper_config::*; +use std::hash::{BuildHasher, Hash, Hasher}; pub fn run(config: DeduperConfig) -> Result { let bloom_filter = BloomFilter::initialize(&config.bloom_filter).unwrap(); @@ -33,7 +34,22 @@ pub fn run(config: DeduperConfig) -> Result { let threadpool = ThreadPool::new(config.processes); let failed_shard_count = AtomicU32::new(0); let failed_shard_count_ref = Arc::new(failed_shard_count); + let hash_builder = RandomState::with_seeds(0, 1, 2, 3); + for p in paths { + let mut hasher = hash_builder.build_hasher(); + p.hash(&mut hasher); + let hashed_path = hasher.finish(); + + if config.dedupe.file_partition.unwrap_or(false) + && hashed_path % config.dedupe.num_partitions.unwrap_or(1) + != config.dedupe.partition_index.unwrap_or(0) + { + log::info!("Hash miss for {}, skipping.", p); + continue; + } + log::info!("Processing {}", p); + let path = p.clone(); let work_dirs = config.work_dir.clone(); let dedupe = config.dedupe.clone(); @@ -123,7 +139,15 @@ fn write_attributes( let attrs_location = { let attr_prefix = format!("/attributes/{}/", attr_key); - docs_location.replace("/documents/", &attr_prefix) + docs_location.replace( + &format!( + "/{}/", + dedupe_config + .document_dir + .unwrap_or(String::from("documents")) + ), + &attr_prefix, + ) }; let local_output = cache.prepare_output(&attrs_location, label_temp)?; let mut num_processed = 0; @@ -546,6 +570,8 @@ pub mod deduper_config { pub skip_empty: Option, pub num_partitions: Option, pub partition_index: Option, + pub file_partition: Option, + pub document_dir: Option, } #[derive(Serialize, Deserialize, Clone)] From 0b724b3cbe218d82386e6cdec8b155bf6ad2a588 Mon Sep 17 00:00:00 2001 From: Tyler Murray Date: Tue, 24 Sep 2024 16:53:00 -0700 Subject: [PATCH 02/20] Also pin maturin in action (#208) --- .github/workflows/CI.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index b3ec3573..66a7742f 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -288,4 +288,5 @@ jobs: MATURIN_PYPI_TOKEN: ${{ secrets.PYPI_API_TOKEN }} with: command: upload + maturin-version: 1.7.1 args: --skip-existing * From 74dec8cf85f4cb6479a2ec58d5794009e1f9d7b4 Mon Sep 17 00:00:00 2001 From: David Graham Date: Fri, 27 Sep 2024 22:12:55 +0000 Subject: [PATCH 03/20] reduced comments --- src/deduper.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/deduper.rs b/src/deduper.rs index 3eb3a59d..92fb3aff 100644 --- a/src/deduper.rs +++ b/src/deduper.rs @@ -45,10 +45,8 @@ pub fn run(config: DeduperConfig) -> Result { && hashed_path % config.dedupe.num_partitions.unwrap_or(1) != config.dedupe.partition_index.unwrap_or(0) { - log::info!("Hash miss for {}, skipping.", p); continue; } - log::info!("Processing {}", p); let path = p.clone(); let work_dirs = config.work_dir.clone(); From 968f65cca83294679afcc5de28507b5f7e48e125 Mon Sep 17 00:00:00 2001 From: David Graham Date: Tue, 1 Oct 2024 23:17:53 +0000 Subject: [PATCH 04/20] add file path test --- .../deduper/pathnotd0cumentz/000.json.gz | Bin 0 -> 25814 bytes tests/python/test_deduper.py | 37 ++++++++++++++++-- 2 files changed, 34 insertions(+), 3 deletions(-) create mode 100644 tests/data/provided/deduper/pathnotd0cumentz/000.json.gz diff --git a/tests/data/provided/deduper/pathnotd0cumentz/000.json.gz b/tests/data/provided/deduper/pathnotd0cumentz/000.json.gz new file mode 100644 index 0000000000000000000000000000000000000000..c1adc6453f341153d43eb314f143855f653ad981 GIT binary patch literal 25814 zcmY(pQ*b6s*sUGwiEZ0#}tS%E~6I-|5jzo$*HsVoTu5`Rnce@ol#6n-+1&%Glqn+P?*+ZIhW>N9<$Z zM_v5ir0&5vrL_APlfl`sU;BJi)N_Olc8wvq_dgNnD6ZGRVO@N2%9n#sLj0L$QkKLr z!~Ce>ZpHcW=Vzn**`XRQEcg@S%G-UT(M{bfv9+(>OZAgqTAr}D>Vedaswk1T7fvAohhk} zpwgRl>jErC)+nV0T8}}5^cS#ek2eObNzFKacA8(tty#0?kInK01IKQaFtQa#buYl+ zHkF-b*(#rL$fqeMD-xhy{>)skTJGa$ErL;N#jWe0?=>)~M1&TP`wHg$jqKodcZl>u zp<#>W8TDfNETk%;PIj$Xp*mo2&Ml2rL|T+a`3E?tSm9wE1NB#FB%R3{x<~n4i^_&C z721IQT1t}EMxm)rlbvIErN@$x`M&(c=%hjNQdilug&nX9ID=en#+vU+<=3S_a<4dR z&lr#(wXqFhN;P)ex&= zsjhh`A3Kw(ZYve}fm?c(`1dySQ~Gk2oOTCsJ;?oX(mRl~LfN2-JH@|q*@AVpTTVPZ zxkOrCf|VhDaEsyL8TFi0@`LbNspM^~LC6qT&{(cTtw`|hl6@n`uDB}reTRiXh2oZ; zmVz0vQCVpWL#tpO>#0!^&0i{xeIq27bw_f^hGJFeccW<(MM@S3=Wz{M{BE?YF9c+F z#J8D=UM7VH!$qTx+!3jZkHTcQuXGbDS<2CmNx8-O@^({{jqAONOfC;{J!PBV$99Pj zFN4j$=HQ|fx~z(nATw1Y6lvKWTa#NxX4;&LE{}zlH_adHb?g$U8If3ORa5hf1{U9Z zE>Duib|*wWuS~0FOVBAXKgV&$XQH9yOFR?2d}cz)_bf*#p_LMIN_f$=AgSh3>fq&- zn#Y6rh}#B2Zrr4-FEJtz9UOYFvmiC|%X)o^ytL#zc!188Q-z+ub-MX=LZw6k`sOW4 z3Q(X;FdqU1r|v=$w=sQm=T;MQAZvqSTxujs~l%u0l& zn;Q_M{6%ereSO9G{zyeg@fE~grW;Hj!W_ZtVd79QQOR3iGkd_@Dvw<2Z4hi;X&#P+ zcSO|ZReE#QM+s&4%|q!w){tgRq9=oW|IQd-ErrAqm_b~ zIgFTeYLisUvaMT~>tiv;zW+tbK6jkD)A*4so$855P{VhiZKWtF=F|(i8;zC7HKq&v zqt$@KnCQtJLfCC4s%ZLX;E8eHL%ib+YtUT0U1lxIPZsQ@;BwAzZNgMuo`4FRdeIb* zJYO1f3xk;^Lp1TK%W}R`splXi{-t)UhBYw<-gb)SC$789x)T2qQX%*-A5-TlGbTUW zB<;$z>Qae^Q`kF>dY7bc5CM8kSID$#c_Dm1K0XJ-#l2b}dpN=(SZ^OHvDvbrL^SPp zwyVnWS{a4{B~nc(vxg0i3S5G~>; z{^vYkCX~U(a?-VQHufdzrnfA_xCctx;;}9_`)4je?Yk~XD&X$x4{os29m=&e(UQpY zOaxLcV);0;o}Dk#*3J2OxIdu#lX>x$E4vjB zbiNnUXb6J$9g0?;U_O~(UsDuW|0@3A1{_(&*pd|e8G^aZ$$7}rJ3FoW^3a=D5B&4W zM+zRp8fEnp?)uqKq)_N|x?oXb2Hwj>K80e+92TH5@Yl*O@8gx1_OGP)Fr@GNy`&@) ztSs{E+jORY5y%48#b6iP%X)@EIu8MeVYxLgR%r}_8(WTl!4^g&lPYQSm|-_yXHi&? z_?xR>;E)xqybT#aVet4{E0!GqC?o$PiCoi2L!v#s)dP^=3NhtSS|v{m+Y7~da&QyT zJJffQ^$pB`TitY8mGIN@H@$_c^CVM7QgsU@AS z_Kn+BB6DPkb?lBT#ZmJXZo@K+t(L+m8i*QYpLln(7uf*cssXjnF0nX=5a=$hDy>|# zxlp@3l=SP`PSE3Pxs-22v@e@tmf`dzicOanxO6>BC~n&9dOfJ|Ua{M^J@U(sDp--0 z^IEZ*FG7pIU?C$z5f`}RxQAbQDdBzmVqie4Rj++QzTzufxp<-=T50k|>gz4O5$EYk zqCF*q$m;Q4xm|1n{7o^*yOq1ppmLq&w=F2bVp6o&RqFElcpZl5i+hx~tCZb2TW-Gg ziqB=)AAjYA@*v1?6Z*HQx?0Dg?g|$!YlC*Zdln`M>{BpJmC56<@wD>o&~@l0(Al>X zTX!AYqwCAiYR6?z?Fd!riQ}iMf3|#f5Da)51^9)q-Du}T-*k@;1syJ6Jm7TZSc7_Q zcq$*AD3(D|CR68m=k6}ywMfKLo_BocKp#VwHHU0r^eF247<4vIB~^)|o*-q;ol|rj z43{L3!)$iu?FBAsh9QpOKfqd_c~Qv4%& zM)+SvNDC)!c_4B1N_K$RUPSKX=jVfX|BFzdODOWXfLttO46!KW43Q&Ac6Qf@B_V`# z5#hSL&v+^-l;QphN_PaI(3G!pZgUQ5p+U_8n#3?V`jU0y@zn$7BWYu7hTNqC=FBZKXY_|+=0B8e zD!5{*PP3@@i>(UM|tp#@QFFwx5p3Bs+h3{4@xbYGT-1y=lNXZY( zxU8%1%(x|f+k4=oc#=ijh%_0oL;oxy7817A@FfmWTx|GpCH^hI3M*ce7%vm$AwN(P ziEZb?vcgyo#1l)aCFzXw(8rW~hI4Hqt|};(3~$+ft zxjO$5&(u*i-{hP~>|6%Jm-Bqr7j;zyk|Vh~{rPVQhI?bj1O z##O5o{!dD!%`^WW>Eo)%p-iz>&cOVCgpYxCwea_UQu^Ee{}Tzv3cajJV}?z&TKQvM zwNlYOkF!_A@c$`o3O`8LwTLS;*QoUIu>R_v4`ZIfSUTPYRWB7Ab!Ur+*U;N=XWC6G zRVsXanPt&@L$+NkVwFGpf?tY1A1?5jiZ|f_M0k`2HE?yUP+9&69xhhB0}oCVnu^S$ zOUP+f-a=e^I}NxeGxPe{GmzpT7PIFWc4r_=I_!?Ia{_ID_=!{@vMB7Cc6T@8zTSrN zepYT89yCWrt*`P9{y9je_e=@*Yx+5PR1rR};N^YG9#G*T;}OnSipgn7qU2QB{rz(j zRTe#)1MF=^0W7=qnou;Pl_%Rf02VCO&}Ss~s_O_hC*RDwUp(DizO(*Uiz&-sB(=?8!9g$067;59x@1g@jLxKL?G+ zaw&&h+prAhp}@GTgMVa{KO;S-BMQ5VEM_Hf})a9?QLp`j>j7==aq-T&EWS5~#nIqM!(bvbz zhV;`lFp)JW_z@%~*om1=$dkJ(czIdP4f9HqwK~^a;d_Ca6|RkQx2`E-t2^hI{pztw zWMvZN^Re|pOk+9o)OEf#3s+xvBFRGXqx?y*^25^u#pvbdhA^{ z8{9cHWrgF0frCs+> z4v=v3VHjIFq*xTG46$Z#tJ;X5-huH%0LACv%5%#wVcjy;1$yIRi#HA7Fwm|4p_QI# z4F7N$fRszX&c7xFj-4$NCezG8h1tw}-49E{*S^BUO#{0d~@4B`5U zZ4n2l!i{n1`G53{M5(%KRIkUJ!fKr4k{nZC3Jip_!({X0#5m;Jv<|>1omN`Q{4>05 zt-|;Zyj*QZ6Zqj1d7X>W3}Bx5Jge$-{?rCK%MEF5k$9e3__C<}=VjyprN#IHf+DGz zZESM=37}=0$WmdqqwG&@T6l3i+cW!=*^nw_(Y+P1RB*P_C13x(+=lEd<)CZM9l&e} z#5v3wOI?iRSwQNI4u zzSrz{`t20&XL2U)$e-Tfo0k&vrJMcFJ$2DV8;TRdsXiue`%nOO^8Q9(x0=UJ+@S5) z6lL|O5?3}G6W(wm0ue<+N#5E{esqU$!ce z_+KO(i)iDjR1|_suzZ#;0nNc%zVN?DxhNV}fBCFX0*j-wd_g>x(Z=LsR1J@Ps-e^!7V6he z62#BPE-*F^a-ES& zyF_)Q@~Pp+v&TTZ;FlqH^X+ABF$#V>GdiRN#wMZ4N=vSuAu*Cfwx#9@(usZOvh(S65;O+mbrt&=vNj>$LWcAT#j@MZVF8)- z0;_>I~QF`-)ztnI7;6liMi}VZZ zf=D4YgsTqb=1GzbEJ;wvm7dMbR_X_{!^zcIf{Yg(276wBu0^--GdyCQ1rUe;FrjY= zPCOGOJ5r0Klr**FsJIitY^bv+1>{tZpu36wedz><8t~l01HL~$JLXkHSSJX$VWB{b z5qsj3Cx9a&+0p_ef5%=S*e0AJu|nW!iGU#21UTm?qwSn`vlX_0JySy`JIX~if=h$r zi_PyDRWrk&Dp_3S7Eii(QFrO)xDFi5HwM^^TxYKGKpm>{1S1 zRhlon4{FrBf_{vYI0ft^N+iC1<)#c^SeYM>OPo zI6jxnflqSvt;^{7src20a<-db*WZq)3U^8HCzjWA*kDYmtp5Gmt`TM%ZtgEugn}sD&hv*=9%`kS( zqkWW`j-y5Ez+)){Xs$9F&RqSNxMPH{7LrjW$GXb%ci@!?e-1&Fbg==X*eCB`7r_~p z($xOH$14gT>(53N+gsO&jni$AWe=d5bu@F3#LeufQ&WtI^1WEg=>jwvTyo#KU23Fb z?tUBGI7rucJco^!gUjhP65f>s4arGRNzv4_F!J~#zeM;zG+$c?kd)juOV#;Y!Dz-2 zr(Tv;r52}Nx5;qLWwmZIzcbx!`{B`cKaI4{H(8FI`xdEGxnI>9i&?zNtzjWm@OPv3coc*=ww!^$ER}$Vhs8w!I^m!%x zWP%Onj4y`~TfAiE;nLO>x4#PZ8dXxpAV_0ViySMQ%{Akbme}8Lq2m}SR~^w>E;V)s zCryTnKnb-`4ep!~+T>KVj~*bS;$Rd+s3+8oJiCIbp#4eti&X>KV-40g0$!_liQG&$ zX#jNP%JwhwuT)bn&E+Gd&yA2tTRO9H@atPs|3Sio1c9(TSeLOBnPtaSFrFc31gV%# zT(3uVV@&pF`uynx3lah5aV zt*cJ?E!IsAod*=yx!eG`>aQC^I9vl-`fs(d3qvg$TquP(s~Og)$EBF1vz;qJQ@?YK zsN&4+xd4n$=AvAbu(L!6_xna@s4lxJF$AW3PAei_AA z!c^u$-+&E$w0ha+InQq4g1^y@2lOcY>5Qo5Z_M`;N5U;kHDu9!3~&NscdbI$%2XxlGXNUWH4o|{3}b2RWzhV2Vy zIFyi{G-_BTqZPyiDT`af2!lJ5%p2s-0)r)RjkN*npuz(TFS)P#X`pEvkMC4P9Wz2ifno<*YR6;(3s4P3Xn(T2J(xhal2SQhLoh6^4NQOE%AZ?RfmA??L~FY^{Rp38zinJ5?hJ$#iCNlJFBcX z=(!+B47%t~Xm-*{u2s|lV*ks<+Od7QCmSDYGOv@JL+kkIzYNZdlg6OYsL4z_8qK=y z$6D?ffkXlkiM5leU@5fvf4GcM{h){Pv9F^QWL#}d8SWH%Yn!55#_K^-lWPnbgxqGQ!)*u0ICr$lJE=vSxm`>h)|>K5 z(@g(j8z@xD;zttTLrXc4V>c^TW7n?v>v}TM3d&C-D$Jge8$Sku5Jlku!JD!0uItEE zXg?)##%^_c1*3}uRS~~-{6^iACSIOJv%Tqah6<7qeGJbz?zqr0aPHY9n$cmG)y$(d zN&E-&!mCoCBeeg*L;G}kzV+^u2i;a^W;xkPp;kI6lhDhCVa4ct;C1SeaGM3ES0H`l z%1nm-Uf&;OqjO^jmH(|}her#3feOhA%jMk#lyxh-UX56JYutrND-)x;%XvNXF-|HU2$Qpj7JQNeDXX6uhK0qBw_9xC zfSGQF=4EX5%4@Z#AK5U>%&5k8dxFO;+CX-{0m8`=U$^A6Rz{n;4cxuvs@WpjrN3{Z zo!=|dMY-)^f7XDfoQdc#2ByAI_Vg`h?@YQIPqCu<2~6cH1b)g*v9+Jd(?@)shvy|C zv$)p)e)N=UYZh0TQ|N{R6KB5Z6fZHAL4qH7Ah|#GxMSJ*%mjkqp1ZQ)&_DBgEsIR2 zY;+cx)W4u~ZN#UYIm}k0l`>JG4L%!2BNI)YGZoSy(O~dm6m|QqwBPHv<57m1rnYjH z&E&^D+*;X;QP3UrFUTBBG5qsej@8h58bZ$Qq_;8el}>=`C&4@{a{)y@qPYRMyeAE` zo!$hZ@r+W6t9ye(Or@!qwaKg3EPXowuu*?EHjx?wTvFgBE_`)*^8^8b$TqT)g<@n3 zUhBdC#W(udY@Dj$GX|>epwyvM2=t(V@j@`#B#vpt3|s;eAVW?Ya4HZsv!E0ZL;}x5 z;EdzW>|mxEtuwQLM6@gwFCwfVGllCqBg-j(oaj0@z$^JP@&^I=@S|*bh!&bL`+Wf& zJ&aIU##{Qz4Bbuz&EA>vpg;=n#m#wgV>v-z;st_Ar~HYFrFpI-{6jC*&@3r3@B-gJ zgQ$SCngSOB34LkGnbv{HnTm4JF(yo*B;c^78t$clJ20?(LkNwjz=np|s`=#^pk5CG zTRvz?tv!~IAzgrjCf;0nuTdT|hXRo>hJS)hHY~1lqR$dlc@x>7e5)u$A8)b460TQU zT^SO@#(P5eh?Nz%-$h~?cJ`E8+jsF=#1N%=H4UdP^^AmS`Fw~JP@0F6-M@hpY(6Ub zYmk>!!6=P&wnFOn7yML#(0OGb!w#{M|2I?zhI8RDLoRb&Jyb9El+bOp1) zq|YcX4jz>W+yScc*}PpATa_m@pW-~OdmEygVEamd@`CiAMaGzq$u0)ECHu!n|-jI%_n+4wEARz#JELFhpQHNIP5S40K8|5)^hf zN;}YE)+8SY?&?dg$wn3lX0D)QlV`d}puu?6dT(PG6Nd>ARQA4Mr0}<>H4=@5-rvDc zYxFAeDE*e9+gr!3!Me1pZPd)RNZ=?~fwHY?$dZ%*DstAs7V8L>kYEMgKJp)2GiVMi z3rL1eI}kKkyV*NNzFX>Q`2s_f?T%u^pz3Dgc%LAZ?Hyn6bFNe&uZnN)D%8)X!fU zud{dL6SVciA2r7p-erw09~ads;Kgll-#6ESlbL0InE#q-fT_{pB9!2%@y;EyVAY#L zE8dLvKPu|pO*h(mo{`sGeamMYb4B7@@ z0%VB?g^@)F1Mil=0le}5b>R1$`j9u#Ni)>}Ert)MY*AEP>o753+{IVcQE<-#~uMFeI0~>uFAFbf$fo zAH~D{&51Yt`RX777NTpzm^q22d$VE-%?nr82k+9^T1LiN2jHbGM-L&wSO|c_&+pOV z%fk0vWm^qcwJNFtiY&Nz$M44!l_Em?5vF~uJ4Ktoi;Ls@#bzO1kTK8&+ zf!{R?1$`h8CUd+?Re?2$nJA&9h-9{b_)@M@Pv?^9A=-w=xWBKNwa^1av@(x)lpgPT zvTGb1OtQMgkYfhN4O!#hLpNi0u3w--Y?~hj|`JeEV z=g-uhy_r45^?SR${dkH=a$+4L(Ip)vxGsFzxqGk;gMXKuiYf919BaJc$uVseUV1l`~7EMz=}}o_jkxTwEQz~21A`Uv^r_-kSnIOCC_%- zrj`E;Pl;Lc{PCW^7oTB`E9?!GtP0Ai-(9&s zJn+=8IOl+gTTP&vfl;?x?MIzD9HSDH*s+ZlOJ-?`2Z}u)Hl<-x{Rxz*kwKGThoPr% zA2CX&xC!+_^5go1-?0j4E9ix6A1@rnV}h}=nLD#61sEXBM>5v1`sm%wbRO7aS6B|+ z16>N#7>C%>emuROyAR04yqQqd#h|dLqK}{+aw(xa!$_91#zm4}rFq%nAp)+1?-=*0 zPWN8(J50pOABAY4eeWywGWgqI_0O4sP8jI#$?wE2!(oahJlK})oOYx|xMGCqemd+F z9w@ek4{u-yxDd?p*wsW*)eyG_^b+222m`O=nf5VHFam?e#0qizyiog)dOCvbHA^0e zI*lWtmoUmW_&x504vF--!-5sECiB>a>RYrAS|Ctwk!nZt^t~tvQc>EsJf@PxY%*xi zf2cE$Pke4hx&xm^T>KI=JBsaP12w|W4QKQy#EBCC;@_;TqqQZ4p8VQfuF=enKS_WwG-yB~JcsxwGGe$B}L zYS6e;N|z}x+^LiMr!_YoXlfj12VW>2d{mMQ#heQUXxI=tZL^628`tpeYQo983jsF$ zc59^!Tz7?+bdA=t(W=P>bhXjOq!9zvdAj~IC>yQ6c&!Z zX1Es+=5J*eHu~kwNJ0w4#%Ziw$?Di~A-+*jkoybup@lVcBXkWgt5I7f(KZl8vJc_{ z_!Apw!T=02B1{D1rRx|x@Rt-hQO7XW6A%M3bvUthl0%Ep@)S#HTMcnTb{O)_GrmXb z(>h}3^Qm%r>;r@1@d+AjZgSK%eLF!|r zR}W$yAu9!#g$xcj=9EU7+!7=Wl6{yt6q9`V%L^LC>N|fa;?jbOgH^6qS{6ES0FbTZ zNjQsDpj?a(zP71_|APDFn-Sus{e)5l6z$xn{QBbn8hG{?=8ZqyMj4|Z!ZR8@p5OPC zn_Hp3+(te=N%oglqLFEvyoXg7&Mp$OIozFd?qgC)G&<=P>YWUYHCctoP3Y23*1+t2 zg&~0}{oru!^3X6UCh7el872(afFAc+Qc-w;`cry&?9BmEp-2+JGfYbmjkzFL-V1B2MR!Yk^KS}cEQ8)+1~Sg$ixvS3dKIHlEwrpC z)|eGsJFQs&tlM}Lu#_!qBa_S`Uk$Zbl0l<_qj{B2HUk^4+&uB)cPAW?Q8FUJPK;Oxk)Mw><^DiIl74ey(B&U=r9UHI#jG>y;eQ9oSQ^agH${Q z$5R1FL*fZj@>pO$+JsdkRvy|j39bx_J1!v0p=aMHP&BmzjRA!2PXZ~kkq_JifLv$fenJc31u`EDKc8MC7oowS;nRGs)X_ik0vf!5Zv^^Xe; zjBv)CP-GD78N=}3Xxb$;JV{g9NB^nCK&*DDh4vjT=76E3N1?bIdYoIJb?EEVk(p3T z7I25tPC~R?`=qus@-ge2L;p^(ny~O8^h)jh4WmHipd|&htGs+Ht=MBg(3}^ytdOVl zV3`kp*2zbaJLTRmKLg|^IxyKjufK5A=9_L*iC_t^Hujl|fkx%S@qkAA?bZ&v16hac z>t&zN_7vVh0lSh9qhL^p{SxR-O{Dk-)0qU@O3^Jro#X=Zjz;emvlB+b@i;E}n|vwb zZQLyK`&WQ>J(om2e7zmo;gF$K^HzA!_yBlZ@1+Ho8J_ke*O94F9GX5#RrnQ1waD;e z?|@1;XwOZgdI{wTEH5=(76?;MJ9h6lKFIYBcYCfSmqRDj(zi*48a}Q7B0LTlx30c6 z%%^NroB$FexTwAl$O!)8`Rr`~OajX%KB3+`MtNP!X}~?Br=K5hdhGK03{XrSifhQ_ z@>mN?Np&89)cAL)CXNfH!Y4d0Uw82Cg6U!#3w|Tg>9~5jqz%UJ zB_1S~adPMcV7#}-aTvyvd2iE8?%x!8NZ4S%(?LJcbMREfMWS!Qt>I0tEQ9=4g(YK( z0z2z1-&$FGk(HFyz5tkPX4)JOEsN|VaKhcG5TgKuVQwNegN@ankN1_jRLkCu?1Rzy zJ?Qx)wL95m7i~nu8;S(oQd(qnC%hz`6!i4lY}$H!f+|)Chil-7feoI%k{q;_xw=m7 z`gA##4Rgel(+nZl9D?G?q>w9}mf7)oXt?p-eK^{k{tv$EDZ~@0<4^`>^v$tPkd^YP z)9lfBO^-V1@9bG1WV6Nv@EFLTrpZ+ly)-@Nh{KQc4u#|p#F-+h&3bz zArKjlt^QT`V9R1smz4H2BY2D3lW-b7Nj+BfMki{3i!Xk&YT&VHj+ZKI z<9+9eQjpZR_g)_LJhoR%s57eSW&Jd1cZQeA4@P|%DF*v{5SBB&v!(>4?UFx(Izm+O z*8)k^-Q>l%{w?4~S0b&!o2@{)d!$1o%V(IFD=C2?ZtTy#6;!XdhZsqHCrcs;G+q-D zfiriHiwjpbHj-h*;{w8^D3Zo=^3k?71iqWec?0$Zc>XhT+Y3ouz_VH!5muL33ocU1 zf5~{h-=w>HFd3n;hh2!msihmJ3jbcu=Pu9fd@C#tCnnBLOzv!o6(arfUq(y0aRy9S zI@<$F)7;0?dku0}>_RV56$%1pl{eg7oA|DhN8lz6vW(GENGx2;R^fpyfoD2GE(~NH zsY-fNo;4KJE8y8Hq!PAqbD5@17VXuLaw}kjwm8u(5JNiQMhruBsri~nMw-zHRXkfZ zh%K1|)tJN@fV)a_DIM5bxM33oXTlnv-lbD*Ejr~rd;ZtA_llu{W*`Vx5P5^1}2oNcCP zriD`}B=r(lj5W?+GlZc|I3vSQYAUfT(wTx@|g(+XDl2}Ha(k~p&{;$*x4 z$Hoejr8y8&M*@x^C^Yq6cxo-5LT?^iYMvCo{8E|WQhDaNJoWW|ntLFj&Y5z}T9M8V zh#LDXiXY8OWH07`gnj&*;Z{-R;bKd+?CNhueLClJokZECG?RGm5<(0w_nPS@TgY7f zDNlEX!*!0aCR?7l+GGAMjf3?T(z#z}qCse@tO|Nl+|CS^{Vk+ zp3}+{Zq1tU4jYR9Cj7?|U4WC>RT_JGEiwN))g`+s^}lsd`oKR|l^RBxLbtR#NU8yS)N9&2@cVYz~OK*@_0eI5QHKu1Gf9@Df}}m%kTB4DeWL zA?Y{1mu3yZP*U-G{abs!m5%;NK7X2|uZ;N{{*ySIzoi32+@zQm5G329jJ~Lz;~*K^ z;l=li5qib^i!U~%7g;4NHWxc$8WUb}?5T|nDh+te5`^Aq-TqoPurhX1JOXdBOrIaA zAK^nvG4A0LvEiqLZvM=Eg&=`-=Cjd z+`rQGN22%2*XjlzAAbwv!W+abyK=XOmLYBz8sx>{d@&IJ&Ta2TF8MtOnVJRxJ9> zM3Xi>J`dJkUaich3uFnQ3~Bxs(XjLmKK!S0d-zAFrE|;do>9(NsF=xBu`lUM3I)%F zE4MEOo@~(=qdm+g$ZL}p`gv3XTPI@;H@w)zPBaX`B&9KIjMSx_8_hFRYc)Q$IJlx0 zAOk377C$5@-Df`{Z2AO5WLFS^%Vt~N!z$qa&nJdiX5Gp~JTG8lOr zs0>C;rs6VF%PLPTfxH6afx6yGIZLp`-56uS{X&G%Ss;s@!W}9zCzBgHq8zWP3X(WM z_2lKI&Qwsppd7JAWH}74S%sYoi&AiGIWIl zgIb&&X4DFKHkU>?L`Pf>#`bB*rhYQIPaaHr)tS6wM! z6y3S6HglS_T7a#k$Iijnz7-0so-Wx9C91Jl((f7W;Wj^{k368p_hSoVXbsdrC6_fM%Ai{M zHiLpUuxR362Q!?1;Ary5r#7Aamd(ac+0wL<5Qj*Wn(JT~g3`i^ zM){nPgct;mGrut|c$rE4`n{ukfKPwNd}eQloE`iPMN(+rV3{3_l4T|EHt96&g6Uv( zsY^t|ZBrgIHtuK_VBUuu4Ke1%GeOQehw}p0K;qY&{2mN>kgFo|++PhQ%)+mPNX|Nq zkY!<04&~aX%$XX^)DAhz2_XW&Z z2we~`(h%BhZi2fp$W!H1si`TE?q;{E=vpKoi>r%WI06R0$_@cBF$ilzs=P}ZtnAs_ z_CuY(?&~kZP8R~3pwDJfHhbaNd;!rrt?`<~2m0XscevPynV*O(t+K%C*buRm+>3;&Mn+a6>= zTtZ@()#Sb=|64n6fT6o9CR$N4*p|>%jH|Rx7))I>`8E|7Oq_A{+8ARVV)#QI7G1`e!}`^U4VmHK^u;sjaSw*P8j> z@#>yIuT*xxbQDu-xjmE%G2N3>>s27qTDC$?x!F4HFz)Mz`M3WK z$8T!0%`L6mdv9-2?C1-0LY?$6m$$-uuFYzUv@2idCS3sIpd0=|accmnu=d9u&+t(e z1y#W5BliT7-!H|rv8KJ(Xs6E{fS~(O8f1SG7|re^1Z+TK3>so(?yX(Z9#zq z%I-5BVK2R2{DNjSO$qxbn4xQ+LCoF-+!luY%7H$8{)GoM#|1{9Nfb7DTeNXMg)R~d zHTX(^0S<&6sh~&v#o}0AF7RyB?7^j1;(&kZ6~iYF8?kh0!S3Wu@3F{EVbN_fB_(E? zU&Y7ViR4$d@#Nb9XOQB1#=QZ_kM4(L$3Bgyzs6QW=!j%_TI8*Cvd(~_n1Yus)ZPr9 zqQxX?QUhw?coLpt${JmSpXMMm@2)=s#&2Ov2K@~e|A=Dev5B^1d9u5q?m!U)+4uz- zE@Y}|L-F2$ZG<9|wCqUQ33A{z;vW7L5nM=}|K1uZaVkFyA;6-4(8g(%-t$J_(BK58 zjA1KyYh#`joN~i9RlIgSEUAZG$YC_>U$UaI#bK(r-@`C?^X!g=M41~V7G&Gb!7`ko z+{Y?nnu%7ru$Qef+c6%-&Xo_Hsp3DXOrB(sNct^x;?qV}3!5G(syNQAR=th>Tb_D- zt0g{b$y92{jfOstn=19o7W1+Z3iXphy{tKew%XK;jgl0FWX+woT(`*q6~{$}2ofnL zj7Qpb3t4&Xq(Pgm6S5hh?rwEY^$fnBqn=D#I4ADZ+t2l<99~72u7NkN(GOZkxIe6? zY@jYizV%eDxX(bcdQ0qfTLUNet3pGdbxPMAJ{DRn&YhO8XZTVI7zFYX-(8j(pWB* zBHLBa-ICJUT~aW0mDsPAGr56xpt4C#bldkSNegNC_;E-??q(uKre?Cxydpc`TL7wjFR7X7<(h6|kFTI{ zlIeuN0~r8ebMw?noqR>ZuPR8?B`Ez{Hs8VYze} zbmV`LeT|CZh(kMe6gb;1aNy zq($aWpyFEJptoBpn4JD`9Q4X-&U#jW=KhL#VL4;7eO_L=X#K~q4jXS1P;r<`Tbz(l7Xb0$3!c+c|-g^wk1t#cZ5ccSK!Ck9&ns7Pw9ajKr?vgw+S#{E({034FhdvveWU@W z@tqO-^lAELd!NWXe4uJ?XmpIth|qN3=g@VCRd2Aj_qdnG+}B%gy`~|)3*PDvVFj}q+~-%!hnqr zX{;d?8d9Ml71)kVAM)C!R1heXPD@&8Ns*QmX-ScmwAqr9ZFy~5Dzv3STl(3S3T=69 zTVC6dk{xNYBPBahvLj7*q(WDk?n;HOG~JbYU8&cVdR?j4lL|d4(v#nM{99Khxhs?0 zmC5SLWOZe-x-wbaru^2F-(;4$GE3d2Ey82(wxndMB_Hxzi#y+y3F)?_o=ix$Ek$G| zx-t{pj=Z5G4`sAn8D>|8+3j+hJsCr<#t(ba`Ch|*>&Qn}KBR)IbWb|pYf8zc{3c7> zleO*17o~&)JWlKscveG>nSWi~ECrjLuf%V!_PZqo)w-f@5#FN zWZiqR?mb!ep3Fo~W}+uc+>@E;%S`m^P5EfaM_WER^1*%S%S!iUvigmNd`LZ+y1p!b zzhNuL8)WMGLJa+u{g75<&iXQEeVL`c%u-)wsV}qCms#oy74(G)`m*kQnTdYKK9o+# zO!R>b-|W`}l&A(=BKqJ?&_`W98uHPSkG6bt21 zM@v3B@*yRe<@M_gDcQ0gQc`B3E;GS+{q)h6BD@@ZM(xLkRFHY8%e>TOLKwOqA5uZ) zrQVSa%DgaYKR%?fuDrG@C1qafJt-pdQt!#bz6?wzq%ITEV1lNPhI};T!~WKhkDh#R zD-D^YhRjlPC7V*RDJ7e>q`X1sx+!$slx1wnGB$;vn=PpzV`$21 zHf0%`ZK)uu*%b0_$}%=P(zL8*Q;5CUk$N4eCrjKE(r?L3v}8G2vH&fakXA!}lbLAA zOtfSsS~3%@hAr8X4<27j2Hld;wuER~EvYU$wIw5J$xvD{l$KCrOUBTWF|?%fE$Mtq zI^PnCY{?dF2}QQL_BZK^5N1mVv(=L}W&5^-OxrT{cFlg{UbkiW+cN03jJ7QUYs*Tv zn^I2}ye;Ev%T{a4P};H_Z5c{ihSHYhXv>J&GNQJuLtE&xE%e!zt=5+DwPj%Ku9TFa zw1u&VNYxifhZt>5KI9>vhV;P`B9cyDM4Y~eIDL_A=tGLgs&r&1BHPeMM?R#ajHn|c z64|CNvQ1xPo4&|4^kFMV)3OakIH8Y*d~k)X>~IlB=tCX~5p{)#L;#@=sV-v>afCjk zf^49!jHoL+M}!mlkoO6>iEz>v;iTV{?slcSUE5Dtj;<_+$SCyDVc|L{T-S<;(-|SM zQB&WmkLb(3=Rh;Th2+dm2)l<))mKBkgK~v0i04pVO~j4HT|AdX$DK3JB1IIDte%aT) z7L=wFRv7a&yf{xNS+cRqRDH#0NYup~3AH6isExXNKZ1CnQ?N0u%R*moZ^z>&F6~iy zIkvl&TM{Iy=72q}`Kd|=P3P;Jza9CFp*4k%jncA6JIzP?Fi2h}a>4?CnQuv} z^{>DtVLd`|VSk!){iAxw*CBF0;UZ5$G)~Eqk9;v}SlcBI>ph)2%=yiC4t(cp{{#N> zKLvrc^BF(-L;v|F{N_u!#;7h%uAh`|tYLGGA$p)~bB}c;|5%oLtZDO)(QmlH^m{H3 zSuIOOhN*)TU&onVhaAbboH{3xl?g^Uo?@P3o@B{f#yQMTm~(J9cqLE{b6m$H*Bg0y zFqFdp_{veo5IB1rUp?v=MIFPaV;uD!FJA8_M-E_u%8>)qxt|?5uv_|iUNvb@4Wso7 zn<3`Nrmc;C^PSBy>@9zb$kSo{^ptL6{nO#eNvC}}9G(v8e$zSa^xCyz5W(>J^g_=` z{@_nO3?Yml{i6d4hYs)cZV_(FnIVT$iPb(6hxlQZbe!9RG5M)qU@a)Xw`bXlr^puJ z2PUa7GL8kl#~Yh9n^H4NRh%L7I0!z6XQd?lFD$kj5-*s|1=eviONwq?N7<$YzejeO zjCa~O2*4z)V~>-m2u74OKiEDA^cPo3_P$MwZP-sojJ>YWsu~OQ6vR3_>J34w7%T)n7C!GdFF z3wS&2oL3H8Sy$cyZlQsf!9o`O?Pmh^F9wwd2C@>+d72(yr@>+)xo(Q#v)0-7YBjJQVQabA8lKEuI&mT~ExSP&I2 zOrRq#j3qxRgAZ;1t#TByBf7x8t@50wjEIV+*jtJr@!VHIR~ya%y+#zmIFN%yA}EeT zcBx=3+N|EN9){6fya{&}>F%E4DNtmRQ#UYpm+8X#9z

v}@cmq=((WE5L1qf+t%6 zkJ61zXE>PFl^7}F7khRQ#(-p=IV(V6K%jHqa~>kr8OFkudO_}H7#W5iXsLxx#F-SQUbcM$t70U(T#Gv{KJ+=dpq)3uA=5^)W8`!~b< z7aQismK~cq)Rs;0bMZh}(b6#V8xfZdj__?F#b~X1ZMpPR*VG$+MVpRapC;F1>~}zlOr7~8nhtY$V+=q+I1pGI(!e?`amn)pmGHQgmeT=%npyAzaib#fqL$?6hix(wDR)D}3?r;X( z>azoc!@IP;_BPWGgdv6}3%E5FSVpfe`BcPkINluKt%==TEYDeDk~o|IxqNsx@7!*- z;GEnUc5z|`zC$#E5Cqzzee|gfGZblp4uMaxfrukg=Qa~lB8dRquG!;61G9V_87Xj)zNdH5zFk+;CESeYM;Q9k zHAKkijsz9@ps{`c{t{};(&)}u$X9gk^(&FxN}XgDji54@R!TA`xd;He$RwC7!wlA1 z=u39%DzpjFq}{?+x7)*V9JfJifPwW&bJ`q%I7Gk?LvFheMlUKzgg|c%mfDuJG*e&; zg6Mi622eF(R0ScBdZyPFn%GhC)0^rMVRNV(1X@et)G%*sI8!~fW2rOjS88)g<;T(y zivvX*LKiKE2ZfV##(E374@&@o$s#*aIvlVkwm#i*>CV(5H3mUa?&|q99 z-1(B&5T5*NQ*!G^9=4(=YX=Ilk<@_PQ@gM$&)B3F>@!H`1cw+qd}6^b2Yw`v+6p0s zTt=X7gs|v2r+I?=Eoj5Di81*OAgS5nj~(h6Y1<_9yg&2TX^Z@CQ$ube3E9w|TiL@&$tmQ9wVj({Y?d>{~3nUi6nNGGK z^7yJr&Bh2~M%FXU8IO~KtWe@>ECnBt7+iIIWC_)|`_g%(MXYqY(hDSpCw8FFNQ9Gc z_>162|9Aj>ct3>|HotywDQH3ov-7%4s~!TE5h{?nDx5;^eO!CjNoJTrGDSy=Hp~ zc%XKiU#?{fJ{j26aP2a_3Bcj^%4R7ewnXQt6KN2EH5gA%E~q)k*=mN=y|L=l)yYx4 z)~czKh~9&!jG6I|)ju~I>RIi1Lv{N#)otVdXZro)=e@QhJhOVJ&uZe^wO+ITQ#A3- z{&B1MSxx-kR|)nRrF`C{1Eu`i@zvSalhbymIi9vgli90pZil}c45yRET{Y_*1B={H}|KiMsVFHv5)1v zdAI}^0`bDwzBLzEv#BI53p8X&s*IRp`7Z9`sm*+`bNB@IbmG*Q#%7L@(~@I``6xzE z8T!OS-1&RSaS+mX;L<2yM&eDDz!p(lHjOpfwpT7E9F#}t6=e$+1`m|Nsr3d3>3rK()p__} zSZRj95Szan0U~ssEmM2!U*C^ne zOIj3L3Uq->wH>8o9CC8xWAe1Yx>n4T{7FJ#oVDc}G@r(qH}3%BuQ zekGeUCO-&mG)_j`LayLG%fd^j<94?m%z_K)XH}@SHiD=5k=&hf{g!1yR#IQOjlx}A zh9>4s<%Z+*AiEr5+Ui~%6O&MTG2eI`e#)D;7zJdNAz8!}8p9QNPP@x@G=Ln2lFNeD z?Iev5@E}3lCbsGBZP5y+Dnz?dQ#h2VRkcTZk;phdlTo} zny-+O!5GRNo=A?B^l`meZz}eFh*Sm$!OBMHUdN&;(r|}1exEVo!^MK3LWu5!SRViz zb&$jD(TIfoQw5(V)gX?DIaML#hg*(pN)8d|OYgunzLW@Ri0F)vq=4NXE64F-HeEU0 z8P@^)nI89q_c}6%3p0OQptP75uutruL^kiW(_L0MqJb*MTU^kQ<0I0d%!;#EPbqJ5 zgcErQv}{lB+-0~(!l<~)g6@J8K7&Viwf@obv6+-UNc;Kf$t@w+O( zC_tbqU|SI9F7v`2+}kMJ$Z)V1hugzU#d-L$5)v>k6kuaIU3NESc>dm+L-y*sD zT%Yp>ZT9lA%ww^CvYzrf6`*?foIZ`zD=4fb(1%IZzGj+6YXe%c;rwi7fF3**(?%&n zjhtW{!{+DNsw6_$i67EHz6wyJ@I@18)0v`3LICmu0_XNlG<3p}oIqB1u*iHotRlD? zk(6m(oqZ9QMW6xSp05 z2_5|cv0qEp9XkId$jgaYOSGN^nYT~&Q1ha}A^K!R^)ESt1^{s$H8f&IitH%U1R370 zH+m5oI=x~TgzbBUlB@*)E8=chy3bvwel8U(RINi zG2)6IY{HOZ&!)bDq}U;jIrM9Aa=Gb|D-Z^LK_pINztoZ8%em*yd^ zepa*9Z**#n&Sy1S^mFsL+54Pk>!+&;KB=@SzU@G1b@KM*a6D=K?p5biWt1b%YM`EdSpEmh}{cVX}6YpKlERMW+>>X{eYXdb@+8rvVCxZYQH zo84FN0{Csc_a)@c-?qL4(86!)jW6l^P80LnX6;KT$A3%HOwVv8{I=Hna;f*ft+k7q zt%-r}93MF&xX>9EgK*GF@;KzO;yg^8$enM@?}EB&sHSSEw(6*^>Zv|WEX^xiE$A4j zw_vudcT~Mg^FVV;o1JEtCKrkL8ZD$TYINWZ(CDc~ABy=Vb6M~d@B?rhWpRxn>~ zBSymo`X}z2BCWG%r6LLLfLt1q7`zw34H)t8?&5?>Bq+kX^?5ecYxH}cM~tr>H#^7e z&nn>mZRzAQtS#^10c&f#xoX{XUY^v)c?>_@{0gOb|(B^ReHn+Pz_tIJovBSD)lu}&p*YCQ}f68n&Q8I131{o%vA zpj01z&^sFAyWrLJ#cRA?{>5b>*!wBOczkQL$mGi{Z8#U!(E7pGZ3dF;*z-U zEHg(Qx*7?PlQ^vz7D4G6ZV-CQ%zC({SdJHEvxNx%` zO^R;?WLG`7>TnoMcU4?Bc1Es|qN4ISSF=s~9`Atlg7?Dr8hH}q9CsO@qB23E6Q+?6 z`EB$r&Y;!hWsFc}(U2+v^2h#{>`G0O>V_bQNKh}ap&>KJuB)LlRGj3Xfb4sU-7#{zLp_^@ zG?y~E(?UwwPrjlhD)B6<-b?s+MvJ{Vlhkk4qnx9VXaVTaMn9-$*sybWMcxbtXunlE zqCf2dARG>#7(ey;?2zyIat+7xkMyNo#cJFk$?AXMH%>UV?Nk$qsb_A^WIwZ~theAd zv|Ar(up@&T2b`J&Ivn)u%m=_T+D9HvHCbF}q8pmU7ZVM=O}q(GTA>A?qavMC!3#JG zEcSR~xBihgcB|?H(<0pQgTSq9bmV*N9(iY?7xehUUcCeo4DCtcId2eR_~JVB_wd5w z0o9(G$aY8B?2n(C#2$?~3b%83c@f*5v8Ok>abh@y?@Rg~y!g8Mb=9RIeep_r$g#+8 z`_wddKJvF;G*d=1N8cgKYMkTru~xd! zEPmhv<{4PZrT3mTrC^^)-Uw>}MUe^{fX8q{7c2ED4W7ftQ_pF|p@6myH5dC}X%jRv z8qwF@cDsklD*>ZEq<98r*6&H|(?D(ls1i;U=?d?h5@RD-0+B-oNx3)jI)qsG>RfMk zg2n_*f9ZK#&jMBzHY|t3XDb9}PnNO*7xiB(r zW|>-m`$FF#A8qvq#_I#ANQh2ur-R*}ZQQx%x7Y@Y(Ml|CkgS17tQ==i;3cVe6fz;8 zn}uIl>S2?txYp*td5k&vAkr2ii+d{u;Eu&qrWFh+tigx<;vuYnz>VP8Su+OvHz3b} zn2C@T0s#ilOWWmVmj>9rEcB>hkl`o|EQUK#_f}&=#G6zTVJ^2kZwwEKfQ3*HetA7P zxqD=sKXgE+6I}J;J2(9zfLi0BvnXqjWh{u~VDzYFZ38Bis2X_uj0Q*WabZoZH58C-pbEw+{41*Tzgbv9&Kxrbm1HDu%bGYRj z=N}>Dz)r|vU~*jGrR3|t-=e;4HD4qisa*Nn|Qb~KR>fw5e3_Wasz3hl7ve6~Q9&38Bl;c#CE?ljSw#e)J7Ys6&zG744QV+%^ z{XqsIyUIu~fg}cWGu|OD4P#!z@hCq;9@u~?fP`C`#u0!~>nwa1MO^fgh}=ht=9HzE zJC{dVKkkzBNOIB}xZqkdJ`CTISSoyov;8OUS$a#k$MIc^Ux3qr#Y|!EoY*S#0U`U+ z%h0y-m(w)!9ZF;6E*P%a5?96f%3Jd^cgCGU^qWTy7%SHaFU&dp@cwYa6c?`$?@04} z6=l~_$^IjUbGAm}amA?u5QGS#zV||3yaw|1XD=*EI)`_@$Kr>2X06hXSAgGVJZ8^K ze{gPq+r~QG$Iri_k!Btm9{WZ)$Acnu9unh&L;n_Vdu^}%_85fpB}b}U^Cd(`A|DQD z^}xr-N-R%PC)k24U=Xr1w$%bi%t^kz5DA@oCD4Qx+{qyKumw7@a^wS!R=u?@C1T&F zQ&plcLKkjy*iUyN@agIW_g<2#EjYFcmUBEPQjvFO6Ee`mG9_6~Who?^Q`+KDxZChB zV75X4cC{!WIvZE-69VGvT^6AeCZLd^pUx$%J1qmw8Rijj%HB8D<%`CPU!>Dp?>$`* z_95r~z!V>l9J>(MGuDw2eD2|PNk{rP-^frNF^5Fs0xCdEWp6XV2?Q_uuP_ww)`)z< z6%cyPoMJ5-o&EWXdwGoH3!KS{`Yf-T8@kEDB^8eRc!^cCi(Y6-ZzpDdL9&S`4c!>w zQ%8dX!Fep*KvHmeTBht4T}~0fscI0-dqqFu1pCZRW!xU z2NRPEXO>{kp43(B|5FP@Jbn{3VIOYv+XA7oe)Fn6HNuB~t;Zfl(9X{?-%9RAPp zT7Db5wG4;_ofre(8~Q(;hI~Je`*3+G^g3km7*32^0iQ6|#Ral8Qr;v1u@v>}n#$u! z7|p@SF4D!_UR{dY&D1w^4SR0)9$$xs8DnbS zRh~@>7N^mQ*!I#Lpv-mn2=*L@D>`jwH@xB%m8Tb%=Kz;x8Wyjm+$WKo$1>{J^O>(K zpu(w$PcEO?Daq}`i4B8D(dN!F#94nF7VC)Mke(_=#s$o6MJ`O4hbns-TivgS|H!hH z3A|ZsabfD_Ubv$d=GpC}jq=L;q$YU4F8`6g&z;qfq6PBjQ(cN!ACSdI7Qqx2dIy*- za0xqkH&#_~%hD!2xWI7?2KzunM$=jDHQmQOx3fxj7Z+6s(HD{b@GpY5ZRR4D31T8X0b_+H^ zyC^a?IzLb@I5bPcP${ex@q%SU*NqGhpctL){2B3S3C!N|cqfFM_LPmTseqC!*>90K z2`52McXly!5FqR@LkvL9KpSfinVc{B9zejMA+gTD!Wu1(3f}3(#a%13in1_BT z#u=sARh)(#rV*~%qHmM)Q)f}X&^a;_Bmp*Y&f3HA0KbnT{U`39mY7|Zv|KEBoUUQU zb^Os)BYwxU3)Nc|t(o8RJv+ZZ z5k#WZjN+bQxML+~FIFC8J#N6MIeosbWaRi z=5+|Q>4hYWmq5ufo;>Q#{_q0#sW9!gZa-T0h`(oggn z`HZVk{;C64qw(44S@(DC+3eek#?9@w7vt9#uNrSJt{W%g>*@cuz)%;IrA8JeW+4s%2|XbbbP^`=ftO&e3GpjHG0vAXGB?mO Qz`EKB0D=V&q!tkY0AS4}G5`Po literal 0 HcmV?d00001 diff --git a/tests/python/test_deduper.py b/tests/python/test_deduper.py index 05a3081f..e4b158fb 100644 --- a/tests/python/test_deduper.py +++ b/tests/python/test_deduper.py @@ -10,6 +10,7 @@ from typing_extensions import TypedDict from dolma.cli.__main__ import main +from dolma.core.errors import DolmaConfigError from dolma.core.utils import split_words from .utils import ( @@ -24,6 +25,9 @@ TEST_DIR = Path(__file__).parent.parent DEDUPE_BY_URL = TEST_DIR / "config/dedupe-by-url.json" +DEDUPE_BAD_FILENAME = TEST_DIR / "config/filepath-bad.json" +DEDUPE_GOOD_FILENAME = TEST_DIR / "config/filepath-good.json" + DEDUPE_PARAGRAPHS = TEST_DIR / "config/dedupe-paragraphs.json" DEDUPE_PARAGRAPH_NGRAMS = TEST_DIR / "config/dedupe-paragraph-ngrams.json" @@ -48,13 +52,13 @@ def setUp(self) -> None: # upload test data upload_s3_prefix( - s3_prefix=f"{self.remote_test_prefix}", local_prefix="tests/data/provided/deduper/documents/*.gz" + s3_prefix=f"{self.remote_test_prefix}", local_prefix="tests/data/provided/deduper/*/*.gz" ) # copy provided config files to local temp dir shutil.copytree( - "tests/data/provided/deduper/documents", - f"{self.local_temp_dir}/tests/data/provided/deduper/documents", + "tests/data/provided/deduper", + f"{self.local_temp_dir}/tests/data/provided/deduper", dirs_exist_ok=True, ) @@ -82,6 +86,33 @@ def test_dedupe_by_url(self): ) return self._compare_dedupe_output(expected, computed) # pyright: ignore + def test_dedupe_bad_filepath(self): + with open(DEDUPE_BAD_FILENAME, "r") as f: + config = json.load(f) + + config["documents"][0] = f'{self.local_temp_dir}/{config["documents"][0]}' + config["bloom_filter"]["file"] = f'{self.local_temp_dir}/{config["bloom_filter"]["file"]}' + + with NamedTemporaryFile("w") as f: + json.dump(config, f) + f.flush() + + with self.assertRaises(DolmaConfigError): + main(argv=["-c", f.name, "dedupe"]) + + def test_dedupe_bad_filepath(self): + with open(DEDUPE_GOOD_FILENAME, "r") as f: + config = json.load(f) + + config["documents"][0] = f'{self.local_temp_dir}/{config["documents"][0]}' + config["bloom_filter"]["file"] = f'{self.local_temp_dir}/{config["bloom_filter"]["file"]}' + + with NamedTemporaryFile("w") as f: + json.dump(config, f) + f.flush() + + main(argv=["-c", f.name, "dedupe"]) + def test_dedupe_paragraphs(self): with open(DEDUPE_PARAGRAPHS, "r") as f: config = json.load(f) From e06f67acb14afcd188123862f53fe519f6226055 Mon Sep 17 00:00:00 2001 From: David Graham Date: Tue, 1 Oct 2024 23:45:51 +0000 Subject: [PATCH 05/20] fix --- tests/python/test_deduper.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/python/test_deduper.py b/tests/python/test_deduper.py index e4b158fb..c78a6442 100644 --- a/tests/python/test_deduper.py +++ b/tests/python/test_deduper.py @@ -1,3 +1,5 @@ +import json + import json import shutil from contextlib import ExitStack @@ -100,7 +102,7 @@ def test_dedupe_bad_filepath(self): with self.assertRaises(DolmaConfigError): main(argv=["-c", f.name, "dedupe"]) - def test_dedupe_bad_filepath(self): + def test_dedupe_good_filepath(self): with open(DEDUPE_GOOD_FILENAME, "r") as f: config = json.load(f) From 2d044988688bc5dc503c289958efb09c95d97555 Mon Sep 17 00:00:00 2001 From: David Graham Date: Wed, 2 Oct 2024 10:13:37 -0700 Subject: [PATCH 06/20] style --- tests/python/test_deduper.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/python/test_deduper.py b/tests/python/test_deduper.py index c78a6442..214c8732 100644 --- a/tests/python/test_deduper.py +++ b/tests/python/test_deduper.py @@ -1,5 +1,3 @@ -import json - import json import shutil from contextlib import ExitStack From ba9d3c98cb23f9bd3ffe23b993f861aeeb753897 Mon Sep 17 00:00:00 2001 From: David Graham Date: Wed, 2 Oct 2024 10:20:37 -0700 Subject: [PATCH 07/20] test configs --- tests/config/filepath-bad.json | 28 ++++++++++++++++++++++++++++ tests/config/filepath-good.json | 29 +++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+) create mode 100644 tests/config/filepath-bad.json create mode 100644 tests/config/filepath-good.json diff --git a/tests/config/filepath-bad.json b/tests/config/filepath-bad.json new file mode 100644 index 00000000..a08ef950 --- /dev/null +++ b/tests/config/filepath-bad.json @@ -0,0 +1,28 @@ +{ + "documents": [ + "tests/data/provided/deduper/pathnotd0cumentz/000.json.gz" + ], + "work_dir": { + "input": "tests/work/temp/dedupe-para/input", + "output": "tests/work/temp/dedupe-para/output" + }, + "dedupe": { + "name": "dedupe_paragraph_ngrams", + "paragraphs": { + "attribute_name": "bff_duplicate_paragraph_spans", + "by_ngram": { + "ngram_length": 6, + "stride": 3, + "overlap_threshold": 0.5 + } + } + }, + "bloom_filter": { + "file": "tests/work/para_bloom_filter.bin", + "size_in_bytes": 0, + "read_only": false, + "estimated_doc_count": 1000, + "desired_false_positive_rate": 0.001 + }, + "processes": 1 +} diff --git a/tests/config/filepath-good.json b/tests/config/filepath-good.json new file mode 100644 index 00000000..b2ee06e0 --- /dev/null +++ b/tests/config/filepath-good.json @@ -0,0 +1,29 @@ +{ + "documents": [ + "tests/data/provided/deduper/pathnotd0cumentz/000.json.gz" + ], + "work_dir": { + "input": "tests/work/temp/dedupe-para/input", + "output": "tests/work/temp/dedupe-para/output" + }, + "dedupe": { + "name": "dedupe_paragraph_ngrams", + "document_dir": "pathnotd0cumentz", + "paragraphs": { + "attribute_name": "bff_duplicate_paragraph_spans", + "by_ngram": { + "ngram_length": 6, + "stride": 3, + "overlap_threshold": 0.5 + } + } + }, + "bloom_filter": { + "file": "tests/work/para_bloom_filter.bin", + "size_in_bytes": 0, + "read_only": false, + "estimated_doc_count": 1000, + "desired_false_positive_rate": 0.001 + }, + "processes": 1 +} From d13ab5c0a72f2f5fbd3fcd3ad8957984f8d35799 Mon Sep 17 00:00:00 2001 From: David Graham Date: Wed, 2 Oct 2024 10:44:02 -0700 Subject: [PATCH 08/20] version bump --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index befbebaa..00990758 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dolma" -version = "1.0.14.dev6" +version = "1.0.5" description = "Data filters" license = { text = "Apache-2.0" } readme = "README.md" From 35cecfb637e1c2bb6ca99535ff103e95d37db14b Mon Sep 17 00:00:00 2001 From: David Graham Date: Wed, 2 Oct 2024 11:00:22 -0700 Subject: [PATCH 09/20] ... --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 547d0d00..b6ef2f5b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "dolma" -version = "1.0.14" +version = "1.0.5" edition = "2021" license = "Apache-2.0" From cdb92b2f7d7c90b086c26e83bfaf9b9a074f1648 Mon Sep 17 00:00:00 2001 From: David Graham Date: Wed, 2 Oct 2024 11:56:56 -0700 Subject: [PATCH 10/20] bump? --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 00990758..168d0cf0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ version = "1.0.5" description = "Data filters" license = { text = "Apache-2.0" } readme = "README.md" -requires-python = ">=3.8" +requires-python = ">=3.9" dependencies = [ "anyascii>=0.3.2", "blingfire==0.1.8", From 3ada6e12084340ed3774a845307d44c8990684db Mon Sep 17 00:00:00 2001 From: David Graham Date: Wed, 2 Oct 2024 12:08:32 -0700 Subject: [PATCH 11/20] bunmp --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 168d0cf0..9f223e2a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -222,7 +222,7 @@ recursive = true aggressive = 3 [tool.mypy] -python_version = "3.8" +python_version = "3.9" ignore_missing_imports = true no_site_packages = true allow_redefinition = false From b746d9cbf93fd423f29f2886ebda01493e5d187d Mon Sep 17 00:00:00 2001 From: David Graham Date: Wed, 2 Oct 2024 13:14:59 -0700 Subject: [PATCH 12/20] . --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 9f223e2a..a5be8f11 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ version = "1.0.5" description = "Data filters" license = { text = "Apache-2.0" } readme = "README.md" -requires-python = ">=3.9" +requires-python = ">=3.8" dependencies = [ "anyascii>=0.3.2", "blingfire==0.1.8", From 526474f84baaa5fdb0238f17233a94c5c5649508 Mon Sep 17 00:00:00 2001 From: David Graham Date: Wed, 2 Oct 2024 13:38:47 -0700 Subject: [PATCH 13/20] . --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index a5be8f11..00990758 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -222,7 +222,7 @@ recursive = true aggressive = 3 [tool.mypy] -python_version = "3.9" +python_version = "3.8" ignore_missing_imports = true no_site_packages = true allow_redefinition = false From 865b25097498ebe7e5cb4ee7115391c20d1be319 Mon Sep 17 00:00:00 2001 From: David Graham Date: Wed, 2 Oct 2024 14:32:26 -0700 Subject: [PATCH 14/20] version weirdness --- Cargo.toml | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index b6ef2f5b..517035db 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "dolma" -version = "1.0.5" +version = "1.0.15" edition = "2021" license = "Apache-2.0" diff --git a/pyproject.toml b/pyproject.toml index 00990758..14b9cbe9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dolma" -version = "1.0.5" +version = "1.0.15" description = "Data filters" license = { text = "Apache-2.0" } readme = "README.md" From 0e76d2efc238ea39358436a396f111a0db094c51 Mon Sep 17 00:00:00 2001 From: David Graham Date: Wed, 2 Oct 2024 15:02:58 -0700 Subject: [PATCH 15/20] Update pyproject.toml --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 14b9cbe9..96f8c1c9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ version = "1.0.15" description = "Data filters" license = { text = "Apache-2.0" } readme = "README.md" -requires-python = ">=3.8" +requires-python = ">=3.9" dependencies = [ "anyascii>=0.3.2", "blingfire==0.1.8", @@ -222,7 +222,7 @@ recursive = true aggressive = 3 [tool.mypy] -python_version = "3.8" +python_version = "3.9" ignore_missing_imports = true no_site_packages = true allow_redefinition = false From 1ccd790da3be4caafe696c5a97935e3788fa3ee6 Mon Sep 17 00:00:00 2001 From: David Graham Date: Wed, 2 Oct 2024 16:09:58 -0700 Subject: [PATCH 16/20] Update pyproject.toml --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 96f8c1c9..f75f1da9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -196,12 +196,12 @@ exclude = ''' | tests/work ) ''' -target-version = ["py38", "py39", "py310", "py311", "py312"] +target-version = ["py39", "py310", "py311", "py312"] [tool.isort] profile = "black" -py_version = 38 +py_version = 39 known_first_party = ["dolma"] known_local_folder = ["tests", "python"] extend_skip_glob = [ From 985871297fe6b2539942f8d7a90d2bd8543641ee Mon Sep 17 00:00:00 2001 From: David Graham Date: Wed, 2 Oct 2024 16:22:10 -0700 Subject: [PATCH 17/20] ci version bump --- .github/workflows/CI.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 66a7742f..838abff0 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -100,7 +100,7 @@ jobs: if: steps.cache-venv.outputs.cache-hit != 'true' uses: actions/setup-python@v4 with: - python-version: "3.8" + python-version: "3.9" architecture: "x64" - name: Create a new Python environment & install maturin From 259c4d94a2a117199ae585a78aaa41ccae8d5566 Mon Sep 17 00:00:00 2001 From: David Graham Date: Wed, 2 Oct 2024 23:42:17 +0000 Subject: [PATCH 18/20] formatting --- Cargo.toml | 2 +- pyproject.toml | 2 +- python/dolma/warc/processor.py | 7 ++++--- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 517035db..5d6fdadc 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "dolma" -version = "1.0.15" +version = "1.1.0" edition = "2021" license = "Apache-2.0" diff --git a/pyproject.toml b/pyproject.toml index f75f1da9..dc89ab78 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dolma" -version = "1.0.15" +version = "1.1.0" description = "Data filters" license = { text = "Apache-2.0" } readme = "README.md" diff --git a/python/dolma/warc/processor.py b/python/dolma/warc/processor.py index c59f6f51..474c6ca9 100644 --- a/python/dolma/warc/processor.py +++ b/python/dolma/warc/processor.py @@ -134,9 +134,10 @@ def process_single( extension = extension.replace(".gz", "").replace(".warc", "") + ".jsonl.gz" destination_path = join_path(prot, *base_dst[:-1], base_dst[-1] + extension) - with smart_open.open(source_path, "rb") as warc_file, smart_open.open( - destination_path, "wb" - ) as output_file: + with ( + smart_open.open(source_path, "rb") as warc_file, + smart_open.open(destination_path, "wb") as output_file, + ): it = ArchiveIterator(warc_file, record_types=WarcRecordType.response | WarcRecordType.warcinfo) for record in it: if record.record_type == WarcRecordType.warcinfo: From 6c3ba6c1302d85eb7c383cc27f5007907bd0916d Mon Sep 17 00:00:00 2001 From: David Graham Date: Thu, 3 Oct 2024 18:58:15 +0000 Subject: [PATCH 19/20] Erroring at the rust level instead of overriding source --- src/deduper.rs | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/src/deduper.rs b/src/deduper.rs index 92fb3aff..1c34a902 100644 --- a/src/deduper.rs +++ b/src/deduper.rs @@ -135,18 +135,24 @@ fn write_attributes( ); } + let document_key = dedupe_config + .document_dir + .unwrap_or(String::from("documents")); + let attrs_location = { let attr_prefix = format!("/attributes/{}/", attr_key); - docs_location.replace( - &format!( - "/{}/", - dedupe_config - .document_dir - .unwrap_or(String::from("documents")) - ), - &attr_prefix, - ) + docs_location.replace(&format!("/{}/", &document_key), &attr_prefix) }; + + if attrs_location == docs_location { + log::error!( + "{} does not contain {} . Not writing its attributes!", + docs_location, + &document_key + ); + panic!("Attribtue would be written to document location"); + } + let local_output = cache.prepare_output(&attrs_location, label_temp)?; let mut num_processed = 0; let mut num_observed = 0; From 75a2f98b5c6573ab2b7311590bbf8f6cecaa5ca4 Mon Sep 17 00:00:00 2001 From: David Graham Date: Thu, 3 Oct 2024 13:04:18 -0700 Subject: [PATCH 20/20] Update deduper.rs --- src/deduper.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/deduper.rs b/src/deduper.rs index 1c34a902..9aef26bf 100644 --- a/src/deduper.rs +++ b/src/deduper.rs @@ -150,7 +150,7 @@ fn write_attributes( docs_location, &document_key ); - panic!("Attribtue would be written to document location"); + panic!("Attribute would be written to document location!"); } let local_output = cache.prepare_output(&attrs_location, label_temp)?;