From 46587deda72216389df797d4bcee2537537aaf92 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 10 Oct 2024 01:00:52 +0000 Subject: [PATCH 001/119] build(deps): bump redis from 0.27.3 to 0.27.4 Bumps [redis](https://github.com/redis-rs/redis-rs) from 0.27.3 to 0.27.4. - [Release notes](https://github.com/redis-rs/redis-rs/releases) - [Commits](https://github.com/redis-rs/redis-rs/compare/redis-0.27.3...redis-0.27.4) --- updated-dependencies: - dependency-name: redis dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- Cargo.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d3064890d..54c4cc450 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5513,7 +5513,7 @@ dependencies = [ "rand_hc", "rand_xoshiro", "rayon", - "redis 0.27.3", + "redis 0.27.4", "regex", "reqwest", "rfd", @@ -5905,9 +5905,9 @@ dependencies = [ [[package]] name = "redis" -version = "0.27.3" +version = "0.27.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "92f61607c4c4442b575fbc3f31a5dd4e5dd69cfea8f6afec5b83e24f61c126ab" +checksum = "dc6baebe319ef5e4b470f248335620098d1c2e9261e995be05f56f719ca4bdb2" dependencies = [ "ahash", "arc-swap", From 67b4598225e6c5a697fa647ecff54839727fe229 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Thu, 10 Oct 2024 05:43:27 -0400 Subject: [PATCH 002/119] `benchmarks`: 0.136.0 run --- scripts/results/benchmark_results.csv | 234 +++++++++ scripts/results/benchmark_results_display.csv | 234 +++++++++ scripts/results/latest_results.csv | 467 +++++++++--------- scripts/results/latest_run_info.tsv | 2 +- scripts/results/run_info_history.tsv | 1 + 5 files changed, 704 insertions(+), 234 deletions(-) diff --git a/scripts/results/benchmark_results.csv b/scripts/results/benchmark_results.csv index 981207684..e2edc9eab 100644 
--- a/scripts/results/benchmark_results.csv +++ b/scripts/results/benchmark_results.csv @@ -1,4 +1,238 @@ version,tstamp,name,mean,stddev,median,user,system,min,max,recs_per_sec +0.136.0,2024-10-09-22,apply_calcconv,1.7050352779999998,0.019588303727766737,1.706148542,3.551023666666667,0.20906266666666665,1.684914083,1.724043209,586510 +0.136.0,2024-10-09-22,apply_dynfmt,1.9835509580000001,0.01520292475588627,1.991420625,6.893937666666666,0.23447933333333335,1.966026416,1.9932058330000002,504032 +0.136.0,2024-10-09-22,apply_emptyreplace,1.5998238196666668,0.015927380642159632,1.592549459,2.684836333333333,0.22189199999999998,1.588832459,1.618089541,625000 +0.136.0,2024-10-09-22,apply_op_eudex,1.5777951106666668,0.007463856328587575,1.578263333,2.1339263333333336,0.25768166666666664,1.570108166,1.585013833,633714 +0.136.0,2024-10-09-22,apply_op_sentiment,3.9648692776666663,0.38017112229641026,3.760939625,24.910353999999998,0.294889,3.730174625,4.403493583,252207 +0.136.0,2024-10-09-22,apply_op_similarity,1.6037357640000003,0.011346707375146987,1.600556,2.227638333333333,0.2090173333333333,1.594318167,1.6163331250000001,623441 +0.136.0,2024-10-09-22,apply_op_similarity_batchall,1.4988207913333333,0.018303877114343062,1.492236125,1.7975236666666665,0.173787,1.4847202080000002,1.519506041,667111 +0.136.0,2024-10-09-22,apply_op_string,1.5737780416666667,0.016098774120262248,1.568187583,2.680716,0.19417966666666664,1.56121975,1.5919267920000002,635324 +0.136.0,2024-10-09-22,behead,0.8816215836666667,0.0029518552016063676,0.880686084,0.8362063333333333,0.04412866666666667,0.879250834,0.8849278330000001,1133787 +0.136.0,2024-10-09-22,behead_flexible,0.8807327083333334,0.0007861678845566681,0.8803495,0.8331896666666667,0.04612466666666667,0.880211625,0.881637,1135074 +0.136.0,2024-10-09-22,cat_columns,2.844363097,0.004815946161097188,2.846609875,2.747449,0.09545666666666665,2.8388343330000003,2.847645083,351617 
+0.136.0,2024-10-09-22,cat_rows,1.733864903,0.01796410766456159,1.739907917,1.6407429999999998,0.09199366666666668,1.7136585000000002,1.748028292,576701 +0.136.0,2024-10-09-22,cat_rows_flexible,1.7142814723333337,0.005350188438718356,1.7144205000000001,1.6214606666666667,0.09174066666666665,1.7088631250000001,1.719560792,583431 +0.136.0,2024-10-09-22,cat_rowskey,3.1547543889999994,0.025507955662027824,3.144587958,3.05395,0.09971533333333334,3.135897292,3.183777917,316957 +0.136.0,2024-10-09-22,count,0.07404358300000001,0.0033655614659618086,0.074128917,0.4266313333333333,0.052357999999999995,0.070636166,0.077365666,13513514 +0.136.0,2024-10-09-22,count_flexible,0.462141944,0.0017590328841335147,0.462686458,0.417162,0.04387433333333333,0.460175041,0.46356433300000005,2164502 +0.136.0,2024-10-09-22,count_index,0.007620777666666666,0.000044225044650439403,0.007597292,0.0047279999999999996,0.0022673333333333334,0.00759325,0.007671791000000001,125000000 +0.136.0,2024-10-09-22,count_no_polars,0.46205180533333334,0.003785031735211785,0.460538875,0.4169433333333334,0.044038,0.45925725,0.466359291,2164502 +0.136.0,2024-10-09-22,count_polars_lowmem,0.057583291666666675,0.004786411221385849,0.056398750000000004,0.42018833333333333,0.053239,0.053500375,0.06285075000000001,17241379 +0.136.0,2024-10-09-22,count_width,0.4694743473333334,0.0008699056995136294,0.46932366700000006,0.4387203333333333,0.051019333333333326,0.46868962500000005,0.47040975,2132196 +0.136.0,2024-10-09-22,count_width_index,0.47969713900000005,0.0014514370775266074,0.479283334,0.4478786666666667,0.052171666666666665,0.478497541,0.481310542,2083333 +0.136.0,2024-10-09-22,datefmt,1.774153125,0.0035590442597932476,1.774050667,4.817035999999999,0.18269099999999996,1.770646416,1.777762292,563698 +0.136.0,2024-10-09-22,datefmt_formatstr_newcol,1.696654791666667,0.0033475541032160095,1.6982641250000001,4.038033333333334,0.18209666666666666,1.6928065,1.69889375,589275 
+0.136.0,2024-10-09-22,datefmt_multi,2.127302097333333,0.004210234557934205,2.129608458,8.931267,0.183364,2.122442625,2.129855209,470146 +0.136.0,2024-10-09-22,datefmt_multi_batchall,2.076182375,0.00452843165628292,2.0749605,8.506684666666667,0.16247333333333333,2.07239025,2.081196375,481696 +0.136.0,2024-10-09-22,datefmt_multi_select,2.3498884996666667,0.010352363040544177,2.349666833,11.608551999999998,0.19084,2.33964875,2.360349916,425532 +0.136.0,2024-10-09-22,dedup,1.2833504163333334,0.006031065668196462,1.285843375,2.0846046666666664,0.13363299999999997,1.276472541,1.287735333,779423 +0.136.0,2024-10-09-22,dedup_sorted,0.988590222,0.0025965061259993253,0.9875626660000001,0.9281836666666666,0.106696,0.9866647500000001,0.9915432500000001,1011122 +0.136.0,2024-10-09-22,diff,2.644915194333333,0.02362920456233022,2.636728375,4.132400666666666,0.1896433333333333,2.626468167,2.671549041,378072 +0.136.0,2024-10-09-22,enum,0.8937754166666667,0.004067882760879962,0.891814875,0.845328,0.047180666666666655,0.891059084,0.898452291,1118568 +0.136.0,2024-10-09-22,enum_constant,0.8840505833333334,0.008684506612208931,0.8846767080000001,0.8348966666666665,0.048019,0.8750699590000001,0.8924050830000001,1131222 +0.136.0,2024-10-09-22,enum_copy,0.893037556,0.01224491038664033,0.890319084,0.8450743333333334,0.04683933333333334,0.882380334,0.9064132500000001,1119821 +0.136.0,2024-10-09-22,enum_hash,1.5074435973333333,0.0061144736970913036,1.5084482920000002,1.4567166666666667,0.04959233333333333,1.500889,1.5129934999999999,663570 +0.136.0,2024-10-09-22,enum_uuid,1.6966366806666666,0.0021752266734352942,1.697039709,0.9602846666666666,0.7351523333333333,1.694288125,1.698582208,589275 +0.136.0,2024-10-09-22,enum_uuid7,1.7300057640000002,0.002183014410172075,1.730594666,0.9914836666666665,0.7373496666666667,1.727588709,1.731833917,578035 +0.136.0,2024-10-09-22,excel,12.393773708333333,0.025221651975008946,12.379599833,12.269890333333331,0.921428,12.378827542,12.42289375,80684 
+0.136.0,2024-10-09-22,excel_error_format_formula,21.884125819666668,0.00715533626895283,21.885303042,21.684978333333333,1.0060536666666666,21.876454875,21.890619542,45695 +0.136.0,2024-10-09-22,excel_j1,12.915160528000001,0.04663747179989207,12.901054709,12.124392666666667,0.7840539999999999,12.877204292,12.967222583,77429 +0.136.0,2024-10-09-22,excel_metadata,12.197152514,0.05778919332351246,12.222285042,11.175934333333332,1.0180623333333332,12.131052542,12.238119958,81987 +0.136.0,2024-10-09-22,excel_metadata_short,0.949276556,0.005821559078994813,0.9460756250000001,0.31018933333333326,0.6379016666666666,0.945757834,0.955996209,1053741 +0.136.0,2024-10-09-22,excel_trim,12.549495555666667,0.03768325174515143,12.528584125,13.758439333333333,0.8868153333333333,12.526904917,12.592997625,79688 +0.136.0,2024-10-09-22,excel_trim_j1,14.098593306,0.030775290509148033,14.111557792,13.282066333333333,0.8124296666666666,14.063456917,14.120765209,70927 +0.136.0,2024-10-09-22,exclude,0.5592902643333334,0.000774786047916678,0.559157667,0.5167163333333332,0.041481333333333335,0.558590334,0.560122792,1788909 +0.136.0,2024-10-09-22,exclude_casei,0.5643613056666666,0.0031860825198504364,0.565165125,0.5201456666666666,0.04312633333333333,0.560850292,0.5670685000000001,1773050 +0.136.0,2024-10-09-22,exclude_casei_index,0.5783586943333333,0.003891602650041549,0.577528708,0.533566,0.043393666666666664,0.5749490420000001,0.5825983330000001,1730104 +0.136.0,2024-10-09-22,exclude_index,0.5735247776666667,0.000401239904309557,0.573679083,0.529978,0.04223166666666667,0.573069292,0.573825958,1742160 +0.136.0,2024-10-09-22,exclude_multi,0.8818913613333333,0.010612203719753337,0.877010917,0.8407443333333333,0.04008899999999999,0.874597375,0.894065792,1133787 +0.136.0,2024-10-09-22,exclude_multi_casei,0.889859625,0.007512512836473645,0.8877661250000001,0.849367,0.039474666666666665,0.883615916,0.898196834,1123596 
+0.136.0,2024-10-09-22,exclude_multi_casei_index,0.9129257776666667,0.0040803320905515895,0.912865667,0.867269,0.04425866666666667,0.9088758330000001,0.917035833,1095290 +0.136.0,2024-10-09-22,exclude_multi_index,0.9042534306666669,0.0024943746574162032,0.9048569580000001,0.858998,0.043971333333333334,0.9015126670000001,0.906390667,1106195 +0.136.0,2024-10-09-22,explode,1.5868397643333332,0.012723681110287766,1.593969042,1.54249,0.043229333333333335,1.5721498340000002,1.5944004170000001,630120 +0.136.0,2024-10-09-22,extdedup,1.055031389,0.01862670088469005,1.04563,0.9730626666666665,0.07952733333333334,1.042979042,1.076485125,947867 +0.136.0,2024-10-09-22,extsort,0.7629452496666668,0.01000641117909401,0.757770833,1.2839026666666664,0.2641253333333333,0.7565855410000001,0.7744793750000001,1310616 +0.136.0,2024-10-09-22,fill,2.0876410416666666,0.017161149403491567,2.096520375,2.0344270000000004,0.05202266666666666,2.0678595,2.09854325,478927 +0.136.0,2024-10-09-22,fixlengths,1.3367088053333334,0.0037192935909075913,1.336161416,1.2477593333333334,0.08782333333333332,1.333293541,1.340671459,747943 +0.136.0,2024-10-09-22,flatten,5.603500527666665,0.021892915561610844,5.603754292,5.533522333333333,0.06872466666666666,5.581481833,5.625265458,178444 +0.136.0,2024-10-09-22,flatten_condensed,5.987705097,0.014473185810507367,5.995395666,5.915737333333333,0.07072133333333332,5.971010125,5.9967095,167001 +0.136.0,2024-10-09-22,fmt,1.002943764,0.015636441586786093,1.011603417,0.9553783333333333,0.046455666666666666,0.9848932920000001,1.012334583,997009 +0.136.0,2024-10-09-22,fmt_no_crlf,0.9969341666666667,0.010619823384458773,0.9984205420000001,0.9479203333333333,0.047823,0.985649458,1.0067325,1003009 +0.136.0,2024-10-09-22,fmt_no_final_newline,1.0060015279999999,0.002756928572519994,1.007205584,0.9571203333333332,0.04771533333333333,1.002847375,1.007951625,994036 
+0.136.0,2024-10-09-22,foreach,0.007608791333333334,0.00019723007370158691,0.0076508750000000006,0.0046619999999999995,0.002355,0.007393916,0.007781583000000001,125000000 +0.136.0,2024-10-09-22,frequency,2.8425398056666666,0.015544831875395617,2.847457958,3.763486333333333,0.15672966666666666,2.825130792,2.8550306670000003,351741 +0.136.0,2024-10-09-22,frequency_ignorecase,3.5406878470000005,0.01270137750083542,3.54438075,4.4127849999999995,0.15055033333333334,3.52654925,3.551133541,282406 +0.136.0,2024-10-09-22,frequency_ignorecase_index,1.1237090693333334,0.05675912448248234,1.151073708,5.658496333333333,0.26584800000000003,1.05845175,1.16160175,889680 +0.136.0,2024-10-09-22,frequency_index,1.0480152363333333,0.05581236833496712,1.06061575,4.836763333333333,0.27762166666666666,0.9869797920000001,1.096450167,954198 +0.136.0,2024-10-09-22,frequency_index_stats_mode_auto,1.0070706113333332,0.008835466506475256,1.010171625,4.776612666666666,0.28615066666666666,0.997102667,1.013937542,993049 +0.136.0,2024-10-09-22,frequency_index_stats_mode_force,1.001722,0.017089296086828448,1.000352042,4.752778999999999,0.2881666666666666,0.985358916,1.019455042,998004 +0.136.0,2024-10-09-22,frequency_index_stats_mode_none,1.1178535139999999,0.008691833816215843,1.11994925,5.200261,0.2872523333333333,1.108305417,1.125305875,894454 +0.136.0,2024-10-09-22,frequency_j1,3.3876877083333334,0.029824394521667963,3.371393833,3.2935416666666666,0.09082966666666666,3.369559625,3.422109667,295159 +0.136.0,2024-10-09-22,frequency_j1_ignorecase,4.097180166666667,0.04444788013603125,4.074708084,3.998699333333333,0.09522966666666666,4.068455375,4.148377041,244081 +0.136.0,2024-10-09-22,frequency_limit20,2.8631531113333337,0.01734257567212987,2.860227167,3.7674983333333336,0.156258,2.847459625,2.881772542,349284 +0.136.0,2024-10-09-22,frequency_limit20_index,1.0633833053333335,0.05690465310295847,1.081311333,4.816441333333334,0.2702363333333333,0.9996737080000001,1.109164875,940734 
+0.136.0,2024-10-09-22,frequency_no_limit,4.727112305666668,0.061715317730259046,4.698074208,5.593491,0.180478,4.685272042,4.7979906670000005,211551 +0.136.0,2024-10-09-22,frequency_no_limit_index,2.6022785136666666,0.08396557520664231,2.557269,6.371206666666666,0.30891966666666665,2.55041375,2.699152791,384320 +0.136.0,2024-10-09-22,frequency_notrim,2.4027974446666667,0.008777820223043718,2.398003917,3.2956936666666667,0.15378166666666665,2.397460083,2.412928334,416146 +0.136.0,2024-10-09-22,frequency_notrim_index,0.9821368890000001,0.04746079884980989,0.959449583,4.254895333333334,0.2751093333333333,0.950277625,1.036683459,1018330 +0.136.0,2024-10-09-22,frequency_other_sorted,2.8385583476666665,0.009154976898636062,2.842162375,3.706337,0.15396933333333332,2.828149834,2.845362834,352237 +0.136.0,2024-10-09-22,frequency_other_sorted_index,1.0439775693333333,0.04408596032803923,1.0570245,4.785868333333333,0.278931,0.9948406660000001,1.080067542,957854 +0.136.0,2024-10-09-22,frequency_selregex,0.8031119166666666,0.005922772057214628,0.8064003750000001,0.8800663333333333,0.05293966666666666,0.796274542,0.806660833,1245330 +0.136.0,2024-10-09-22,frequency_selregex_ignorecase,1.0019362636666667,0.003958131683469569,1.000608625,1.0783873333333334,0.05544966666666667,0.998812625,1.006387541,998004 +0.136.0,2024-10-09-22,frequency_sorted,2.670419513666667,0.020978018705839457,2.672767083,3.504291,0.16317866666666667,2.648366458,2.690125,374532 +0.136.0,2024-10-09-22,frequency_sorted_index,2.759115,0.006533081122469273,2.758656042,3.5680853333333338,0.14549066666666666,2.7528235,2.765865458,362450 +0.136.0,2024-10-09-22,geocode_reverse,3.635352513666667,0.02168954742554722,3.638010625,2.964048333333333,10.649058666666667,3.612456416,3.6555905,275103 +0.136.0,2024-10-09-22,geocode_reverse_batchall,3.705906264,0.0007124361086981811,3.706016417,3.307206666666666,10.218860333333334,3.705145167,3.706557208,269833 
+0.136.0,2024-10-09-22,geocode_suggest,4.424326458333333,0.01332076052122252,4.41954975,21.383587000000002,2.6517473333333332,4.414052666,4.439376959,226040 +0.136.0,2024-10-09-22,geocode_suggest_batchall,4.436151958,0.011998674399309246,4.440844291,22.09025433333333,1.42405,4.422516208,4.445095375,225428 +0.136.0,2024-10-09-22,index,0.47402747233333337,0.00010761350663523526,0.474064792,0.4296609999999999,0.04324733333333333,0.47390616700000004,0.47411145800000004,2109705 +0.136.0,2024-10-09-22,input,1.6518609443333334,0.005086299425495461,1.6505388330000001,1.5999656666666666,0.05077399999999999,1.6475662500000001,1.65747775,605327 +0.136.0,2024-10-09-22,join,1.8915908750000001,0.001276399396811533,1.8912137910000002,1.351193,0.5393533333333332,1.8905455,1.893013334,528541 +0.136.0,2024-10-09-22,join_casei,1.8990923469999998,0.005914988734221181,1.899654833,1.35911,0.5389416666666667,1.892916208,1.904706,526593 +0.136.0,2024-10-09-22,joinp,0.791479514,0.001933987035821872,0.790972792,1.4077596666666665,0.08986899999999999,0.789849333,0.7936164170000001,1264223 +0.136.0,2024-10-09-22,joinp_streaming,0.025671264000000003,0.00013254458292589571,0.025618625000000003,0.034173,0.010442333333333333,0.025573125000000002,0.025822042,38461538 +0.136.0,2024-10-09-22,json,18.515705639,0.053356714059158934,18.505266583,16.98018433333333,1.5260143333333331,18.468339917,18.573510417,54007 +0.136.0,2024-10-09-22,jsonl,1.4464047779999998,0.0027990031779148127,1.445855125,7.01248,0.19835666666666665,1.443921375,1.449437834,691563 +0.136.0,2024-10-09-22,jsonl_batchall,0.007816459,0.0001457840333335582,0.007748334000000001,0.004854,0.0023936666666666663,0.007717209,0.007983834,125000000 +0.136.0,2024-10-09-22,jsonl_j1,5.819719833333333,0.011475245824118704,5.819589125,5.685633,0.13232933333333333,5.8083105,5.831259875,171821 +0.136.0,2024-10-09-22,luau_filter,7.428935666666667,0.009451951106225287,7.424287958,6.889022999999999,0.5387573333333333,7.422707417,7.439811625,134608 
+0.136.0,2024-10-09-22,luau_filter_colidx,9.097977945,0.0250814889252206,9.091565834,8.539466666666666,0.5573443333333333,9.076724959,9.125643042,109914 +0.136.0,2024-10-09-22,luau_filter_no_globals,4.837267319666666,0.006909354815523144,4.840564667,4.306157333333333,0.5299393333333332,4.829327,4.841910292,206740 +0.136.0,2024-10-09-22,luau_filter_no_globals_colidx,6.395455472666666,0.018600087946171094,6.404931084,5.861872000000001,0.5323926666666666,6.374025667,6.407409667,156372 +0.136.0,2024-10-09-22,luau_filter_no_globals_colidx,6.387959667,0.010597093263644972,6.393553959,5.850938333333333,0.535815,6.37573775,6.394587292,156544 +0.136.0,2024-10-09-22,luau_filter_no_globals_no_colidx,4.827248333333333,0.010409160348532303,4.8219105,4.298935666666666,0.5270643333333334,4.820590875,4.839243625,207168 +0.136.0,2024-10-09-22,luau_multi,23.364569847333332,0.06760305298498792,23.363684875,22.726609666666665,0.6363066666666666,23.297413625,23.432611042,42799 +0.136.0,2024-10-09-22,luau_multi_colidx,24.95775619433333,0.026643842878957484,24.947410541,24.30202733333333,0.6541143333333334,24.937836792,24.98802125,40067 +0.136.0,2024-10-09-22,luau_multi_no_globals,20.82279369433333,0.2973751150338541,20.659224458,20.181841666666667,0.6393093333333333,20.643109791,21.166046834,48024 +0.136.0,2024-10-09-22,luau_multi_no_globals_colidx,22.24774802766667,0.07362856474543945,22.240344875,21.590948666666666,0.655243,22.178100708,22.3247985,44948 +0.136.0,2024-10-09-22,luau_script,33.837904138999995,0.050418620727490406,33.840241834,33.180931333333334,0.6551846666666666,33.786357333,33.88711325,29553 +0.136.0,2024-10-09-22,luau_script_colidx,35.36256690233333,0.042258863655929034,35.379295666,34.698109666666674,0.6626286666666666,35.314504625,35.393900416,28278 +0.136.0,2024-10-09-22,luau_script_no_globals,31.057191791666668,0.03675492282724094,31.065451625,30.40201233333333,0.6533836666666666,31.01700975,31.089114,32199 
+0.136.0,2024-10-09-22,luau_script_no_globals_colidx,32.38958259733334,0.1005595308660732,32.3332995,31.733821666666667,0.6539403333333333,32.329767458,32.505680834,30874 +0.136.0,2024-10-09-22,partition,2.3267367083333332,0.01474690479917634,2.3239864580000003,0.9367673333333334,1.3608143333333331,2.313558542,2.342665125,429738 +0.136.0,2024-10-09-22,pseudo,1.7149044583333335,0.0013415217707730286,1.7146115000000002,1.6574523333333333,0.055862333333333326,1.7137336250000001,1.71636825,583090 +0.136.0,2024-10-09-22,pseudo_formatstr,1.9270131106666668,0.010054385149304729,1.9285091250000002,1.8668756666666664,0.058633333333333336,1.916294541,1.936235666,518941 +0.136.0,2024-10-09-22,rename,0.9871046526666668,0.0006993774664674101,0.986705166,0.9393923333333333,0.0466,0.9866965830000001,0.9879122090000001,1013171 +0.136.0,2024-10-09-22,replace,2.0904712636666667,0.005182647935350693,2.0914495,2.038176333333333,0.051205333333333325,2.084869208,2.095095083,478469 +0.136.0,2024-10-09-22,reverse,0.9752164030000001,0.003911500340501181,0.974709959,0.8659096666666667,0.10359199999999998,0.9715827920000001,0.979356458,1025641 +0.136.0,2024-10-09-22,reverse_index,6.117939041333334,0.029647164872719674,6.102424291,1.2457633333333333,4.859567999999999,6.099268708,6.152124125,163452 +0.136.0,2024-10-09-22,safenames,0.9873110416666667,0.0017477939056016701,0.9865907500000001,0.9395263333333334,0.04666033333333333,0.9860385,0.989303875,1013171 +0.136.0,2024-10-09-22,sample_10,0.5176511113333334,0.0026381009682744073,0.5161513750000001,0.4723566666666666,0.044233999999999996,0.51610475,0.520697209,1930502 +0.136.0,2024-10-09-22,sample_1000,0.5139957086666668,0.0013784006206823084,0.513462834,0.47076999999999997,0.042213,0.512963292,0.515561,1945525 +0.136.0,2024-10-09-22,sample_100000,0.6506862360000002,0.004323453892143529,0.6495751670000001,0.6026693333333334,0.046593666666666665,0.6470267500000001,0.6554567910000001,1536098 
+0.136.0,2024-10-09-22,sample_100000_index,1.1417670003333333,0.002279856408883079,1.143076959,0.18067633333333333,0.9598353333333334,1.139134459,1.143089583,875657 +0.136.0,2024-10-09-22,sample_100000_seeded,0.6475213336666666,0.0007607925677195599,0.6478826670000001,0.5994356666666666,0.046645,0.6466472090000001,0.648034125,1543210 +0.136.0,2024-10-09-22,sample_100000_seeded_faster,0.6370564166666667,0.00255118044164526,0.63679675,0.5895136666666666,0.04610966666666666,0.634645,0.6397275,1569859 +0.136.0,2024-10-09-22,sample_100000_seeded_index,1.1341891393333334,0.008487503092167076,1.132666459,0.178262,0.9546233333333333,1.126566042,1.143334917,881834 +0.136.0,2024-10-09-22,sample_100000_seeded_index_faster,1.11916925,0.004465985590402288,1.117995834,0.17152766666666666,0.946349,1.115407125,1.124104791,893655 +0.136.0,2024-10-09-22,sample_100000_seeded_index_secure,1.129794125,0.013567921456163637,1.125208125,0.17367066666666664,0.9544003333333334,1.1191135,1.14506075,884956 +0.136.0,2024-10-09-22,sample_100000_seeded_secure,0.6438970140000001,0.005995088218259433,0.64145975,0.596007,0.04657666666666666,0.639504417,0.650726875,1552795 +0.136.0,2024-10-09-22,sample_1000_index,0.03282537466666667,0.00026952162709573834,0.03287625,0.017051,0.014543333333333333,0.032534041,0.033065833,30303030 +0.136.0,2024-10-09-22,sample_10_index,0.019179167,0.00016445632517176142,0.019233167000000002,0.015091666666666665,0.003276,0.0189945,0.019309834,52631579 +0.136.0,2024-10-09-22,sample_25pct_index,2.784334319333334,0.010606911422785317,2.781489958,0.41741633333333333,2.3654466666666667,2.775439583,2.796073417,359195 +0.136.0,2024-10-09-22,sample_25pct_seeded_index,2.78099625,0.00379345691057489,2.78093475,0.41989866666666664,2.3597406666666667,2.777233917,2.784820083,359583 +0.136.0,2024-10-09-22,schema,7.8167974026666664,0.040637817784430603,7.831071542,13.272739999999999,0.393381,7.770948333,7.848372333,127926 
+0.136.0,2024-10-09-22,schema_index,0.11072668033333334,0.000484563093606325,0.11074679100000001,1.0238993333333333,0.062112666666666656,0.11023237500000001,0.111200875,9009009 +0.136.0,2024-10-09-22,search,0.5870271663333334,0.0010020523424938018,0.5864650410000001,0.5419103333333334,0.043955333333333325,0.586432375,0.588184083,1703578 +0.136.0,2024-10-09-22,search_file,1.0692094583333331,0.0021090949990984855,1.06957,1.021632,0.046468333333333334,1.066943333,1.071115042,935454 +0.136.0,2024-10-09-22,search_file_case_sensitive,0.9843420133333334,0.0007680726995515422,0.984066541,0.9367206666666666,0.046418999999999995,0.983749666,0.9852098330000001,1016260 +0.136.0,2024-10-09-22,search_file_case_sensitive_unicode,0.9835609446666668,0.0015630574535778763,0.9834135,0.9346326666666668,0.047799,0.982076834,0.9851925,1016260 +0.136.0,2024-10-09-22,search_file_flag,1.2531727916666668,0.0008596545780139056,1.253225583,1.203385,0.04862666666666667,1.2522879580000001,1.254004834,798085 +0.136.0,2024-10-09-22,search_file_flag_matchonly,0.8408497920000001,0.0026936119251155933,0.840080292,0.7939083333333333,0.04576266666666667,0.838624667,0.843844417,1189061 +0.136.0,2024-10-09-22,search_file_literal,0.9418367220000001,0.004136733581775356,0.9425460000000001,0.8937806666666667,0.046918999999999995,0.9373912080000001,0.9455729580000001,1061571 +0.136.0,2024-10-09-22,search_file_unicode,1.0884298053333334,0.0018019277360431514,1.0882925,1.0402263333333333,0.047083999999999994,1.086700458,1.090296458,919118 +0.136.0,2024-10-09-22,search_unicode,0.5862996943333334,0.0010450075642292427,0.585999,0.5415249999999999,0.04366233333333333,0.585438,0.587462083,1706485 +0.136.0,2024-10-09-22,searchset,1.2479794026666666,0.0033260528682583397,1.24748775,1.5688736666666667,0.10403333333333332,1.244926542,1.251523916,801282 +0.136.0,2024-10-09-22,searchset_ignorecase,1.4406021803333333,0.001061116164262995,1.4411695,1.7601093333333333,0.10427633333333335,1.439378,1.441259041,693963 
+0.136.0,2024-10-09-22,searchset_unicode,1.2449286526666665,0.0048582164261754095,1.243868708,1.5592396666666666,0.10194599999999998,1.240687917,1.250229333,803213 +0.136.0,2024-10-09-22,select,0.4911789306666667,0.000493972853482189,0.491139166,0.4470453333333333,0.04304333333333333,0.490706042,0.491691584,2036660 +0.136.0,2024-10-09-22,select_regex,0.526533139,0.0018390845593602076,0.526113458,0.4818723333333333,0.04352033333333333,0.524940167,0.5285457920000001,1897533 +0.136.0,2024-10-09-22,slice_last_1k,0.008980902666666667,0.00016669415587336408,0.009022959,0.006043666666666666,0.0023713333333333334,0.008797208,0.009122541000000001,111111111 +0.136.0,2024-10-09-22,slice_last_1k_index,0.008859264,0.00013893256714320095,0.008839125,0.005673333333333333,0.0024966666666666666,0.0087315,0.009007167,111111111 +0.136.0,2024-10-09-22,slice_last_1k_json,0.011867916333333334,0.00021738081900741208,0.01192925,0.008818666666666667,0.002439,0.011626458000000001,0.012048041,83333333 +0.136.0,2024-10-09-22,slice_last_1k_json_index,0.012107569,0.000030551562365941323,0.012092708,0.008600666666666666,0.0027273333333333333,0.012087291,0.012142708,83333333 +0.136.0,2024-10-09-22,slice_one_middle,0.2571123886666667,0.0011219957566971101,0.256721875,0.23244233333333333,0.023569666666666666,0.256237833,0.258377458,3891051 +0.136.0,2024-10-09-22,slice_one_middle_index,0.007865847,0.00005689357381110821,0.007883708,0.004748666666666666,0.002436,0.007802166,0.007911667,125000000 +0.136.0,2024-10-09-22,snappy_compress,0.17543269433333333,0.006437975025409485,0.178878375,1.162416,0.10746833333333333,0.16800520800000002,0.1794145,5714286 +0.136.0,2024-10-09-22,snappy_decompress,0.4950796386666667,0.0023225127598840655,0.49471354100000003,0.47328966666666666,0.020706000000000002,0.492961917,0.497563458,2020202 +0.136.0,2024-10-09-22,snappy_validate,0.4778566806666667,0.0004231111302557911,0.47797025000000004,0.45737266666666665,0.019444,0.47738837500000003,0.478211417,2092050 
+0.136.0,2024-10-09-22,sort,1.2448715416666667,0.004062296792098928,1.243819625,2.116125,0.12835066666666664,1.241438667,1.249356333,803213 +0.136.0,2024-10-09-22,sort_random_seeded,1.1755278606666666,0.01747898854165516,1.172330541,1.0638263333333333,0.106891,1.15986825,1.194384791,850340 +0.136.0,2024-10-09-22,sort_random_seeded_faster,1.1654890973333334,0.01166500440181754,1.1681225,1.0480146666666668,0.113234,1.1527325,1.175612292,858369 +0.136.0,2024-10-09-22,sort_random_seeded_secure,1.1753826666666667,0.010507493503398389,1.169346375,1.0649463333333333,0.10569433333333333,1.169286,1.187515625,851064 +0.136.0,2024-10-09-22,sortcheck_sorted,0.4984155003333333,0.0002929748303563367,0.49856108400000004,0.4543566666666667,0.04294599999999999,0.49807825000000006,0.498607167,2008032 +0.136.0,2024-10-09-22,sortcheck_unsorted,0.007578639000000001,0.00018884501208133643,0.007486417,0.004672,0.0023496666666666666,0.007453625,0.007795875000000001,125000000 +0.136.0,2024-10-09-22,sortcheck_unsorted_all,0.5326300416666666,0.002544201329297766,0.532960416,0.48673799999999995,0.044769,0.529936792,0.5349929170000001,1876173 +0.136.0,2024-10-09-22,split,0.9717815833333333,0.04966093558043338,0.9652262500000001,0.816767,0.10735266666666665,0.9257238750000001,1.024394625,1028807 +0.136.0,2024-10-09-22,split_chunks,1.0157952636666667,0.02866098694382442,1.024791417,1.2385943333333334,0.156238,0.983715416,1.038878958,984252 +0.136.0,2024-10-09-22,split_chunks_index,0.237130722,0.11520503998050603,0.203406208,1.0650579999999998,0.205462,0.142551541,0.365434417,4219409 +0.136.0,2024-10-09-22,split_chunks_index_j1,0.009002514000000001,0.00007689415139657865,0.009030916,0.005336666666666667,0.004367666666666666,0.008915459,0.009061167,111111111 +0.136.0,2024-10-09-22,split_index,0.20669505566666668,0.05728089425847374,0.23947558300000002,1.0443603333333333,0.19973866666666665,0.14055362500000002,0.240055959,4830918 
+0.136.0,2024-10-09-22,split_index_j1,1.0268533613333333,0.03843516155693625,1.010459542,0.8795676666666666,0.11235266666666666,0.9993334170000001,1.070767125,973710 +0.136.0,2024-10-09-22,split_kbsize,2.036766389,0.04346158710737812,2.018941292,1.8980946666666665,0.11043033333333334,2.00505125,2.086306625,490918 +0.136.0,2024-10-09-22,sqlp,1.0465692636666668,0.013914078754769328,1.045776375,2.0121113333333334,0.27294066666666666,1.033068583,1.060862833,955110 +0.136.0,2024-10-09-22,sqlp_aggregations,0.9915284443333334,0.011169118131619549,0.9854105420000001,1.3752786666666665,0.05673066666666667,0.9847549160000001,1.004419875,1008065 +0.136.0,2024-10-09-22,sqlp_aggregations_expensive,0.8579473750000001,0.01071784359307729,0.854041916,1.4292443333333333,0.096925,0.849729917,0.8700702920000001,1165501 +0.136.0,2024-10-09-22,sqlp_aggregations_expensive_streaming,0.8589640693333335,0.007709582229175652,0.856374708,1.4293953333333331,0.09164933333333332,0.8528825000000001,0.867635,1164144 +0.136.0,2024-10-09-22,sqlp_aggregations_expensive_vs_duckdb,0.22032431933333338,0.004612126504017463,0.21811233300000002,0.8804273333333333,0.12304033333333332,0.21723483300000002,0.22562579200000002,4545455 +0.136.0,2024-10-09-22,sqlp_aggregations_vs_duckdb,0.19413722266666666,0.004535591784381205,0.195525792,0.6153073333333333,0.10211199999999998,0.18906966700000002,0.19781620900000002,5154639 +0.136.0,2024-10-09-22,sqlp_format_arrow,1.7363877083333332,0.0031499168238577568,1.735523583,2.212789666666666,0.7271723333333333,1.7337600420000001,1.7398795,576037 +0.136.0,2024-10-09-22,sqlp_format_avro,1.0304819856666667,0.005013535079717769,1.029383666,1.9862596666666665,0.3737723333333333,1.026108666,1.035953625,970874 +0.136.0,2024-10-09-22,sqlp_format_json,1.0228909720000001,0.0014389669238213615,1.023682666,1.964939,0.2585943333333333,1.02123,1.02376025,977517 
+0.136.0,2024-10-09-22,sqlp_format_jsonl,1.023724417,0.001900603307837816,1.022800209,1.9640849999999999,0.25551566666666664,1.022462667,1.025910375,976562 +0.136.0,2024-10-09-22,sqlp_format_parquet,1.025136014,0.0019723300715826465,1.024200291,1.9758483333333334,0.270604,1.023805709,1.027402042,975610 +0.136.0,2024-10-09-22,sqlp_format_parquet_statistics,1.025225514,0.0020774838027392698,1.025213792,1.9743553333333332,0.2804173333333333,1.023153916,1.027308834,975610 +0.136.0,2024-10-09-22,sqlp_lowmemory,1.0151198193333333,0.001987065600690942,1.015369,1.963368,0.2535543333333333,1.013019916,1.016970542,985222 +0.136.0,2024-10-09-22,sqlp_nooptimizations,1.0478791386666666,0.0008166565087663174,1.047478,2.0146046666666666,0.18714033333333335,1.047340625,1.048818791,954198 +0.136.0,2024-10-09-22,sqlp_tryparsedates,11.918393847333334,0.013805430703016874,11.914084625000001,12.856793333333334,0.2654393333333333,11.907257,11.933839917,83907 +0.136.0,2024-10-09-22,sqlp_tryparsedates_inferlen,1.9179708060000003,0.006179784729398966,1.9200773340000001,2.8641963333333336,0.25034033333333333,1.9110131670000001,1.922821917,521376 +0.136.0,2024-10-09-22,stats,1.909606736,0.0049196025849774,1.910509792,2.219391333333333,0.09494199999999998,1.904298166,1.91401225,523560 +0.136.0,2024-10-09-22,stats_create_cache,1.9097833056666669,0.004521451395342963,1.9118497090000002,2.2203649999999997,0.09286466666666666,1.904597875,1.912902333,523560 +0.136.0,2024-10-09-22,stats_everything,3.3554944723333335,0.04029503222196921,3.343843042,10.192412666666668,0.3932616666666666,3.322309,3.400331375,298063 +0.136.0,2024-10-09-22,stats_everything_create_cache,3.3900974583333334,0.018655247417147178,3.382474708,10.237715333333332,0.392744,3.376460667,3.411357,294985 +0.136.0,2024-10-09-22,stats_everything_index,1.3312123473333333,0.009204037936792824,1.32928925,10.825936333333331,0.5668519999999999,1.323121792,1.341226,751315 
+0.136.0,2024-10-09-22,stats_everything_index_j1,6.614990667000001,0.15419707857364842,6.636688584,6.550748,0.21150966666666668,6.451093875,6.757189542,151172 +0.136.0,2024-10-09-22,stats_everything_index_j1_with_cache,0.007933236333333335,0.00015488744395958354,0.007865750000000001,0.004810333333333333,0.0024836666666666666,0.007823542000000001,0.008110417,125000000 +0.136.0,2024-10-09-22,stats_everything_index_with_cache,0.008150389000000001,0.00004450365283884046,0.008145625,0.004868666666666666,0.002627666666666667,0.008108459,0.008197083,125000000 +0.136.0,2024-10-09-22,stats_everything_infer_dates,7.172214625333333,0.001711103903247197,7.172136042,14.419449,0.4363456666666667,7.170544167,7.173963667,139431 +0.136.0,2024-10-09-22,stats_everything_infer_dates_index,1.892725361,0.026426721653890588,1.9041573330000001,17.084736,0.5867853333333334,1.8625072500000002,1.9115115,528262 +0.136.0,2024-10-09-22,stats_everything_infer_dates_index_with_cache,0.008059833333333334,0.00016426584580287267,0.007988208,0.004905333333333334,0.0025053333333333334,0.007943542000000001,0.00824775,125000000 +0.136.0,2024-10-09-22,stats_everything_j1,6.374932889,0.03861090214131202,6.378022792,6.5876529999999995,0.26058733333333334,6.334869875,6.411906,156863 +0.136.0,2024-10-09-22,stats_everything_sorted,3.3091379170000006,0.01562012204732685,3.309453625,9.845339666666666,0.40390433333333325,3.2933623340000002,3.324597792,302206 +0.136.0,2024-10-09-22,stats_everything_sorted_index,3.3277645556666666,0.006919087020449097,3.326532125,9.828958333333333,0.4096936666666666,3.3215445,3.335217042,300481 +0.136.0,2024-10-09-22,stats_index,0.2264774586666667,0.004532509862893005,0.224717459,2.293756,0.06589166666666667,0.22308891700000003,0.23162600000000003,4424779 +0.136.0,2024-10-09-22,stats_index_j1,1.8859975693333333,0.0014554704378813388,1.8864251250000001,1.8443606666666668,0.04049133333333333,1.884376208,1.887191375,530223 
+0.136.0,2024-10-09-22,stats_index_j1_with_cache,0.007778472333333334,0.00011276332092632511,0.007799292,0.004687666666666666,0.002419333333333333,0.00765675,0.007879375000000001,125000000 +0.136.0,2024-10-09-22,stats_index_with_cache,0.007986986,0.000047691702244729985,0.007972625,0.004798,0.0025496666666666667,0.007948125,0.008040208,125000000 +0.136.0,2024-10-09-22,table,6.831468819666667,0.023255516463334465,6.827525417,5.733177333333333,1.0903183333333333,6.810437125,6.856443917,146391 +0.136.0,2024-10-09-22,to_datapackage,0.695291889,0.0036729801911510383,0.6936888750000001,2.0978336666666664,0.30246833333333334,0.6926928750000001,0.6994939170000001,1438849 +0.136.0,2024-10-09-22,to_sqlite,0.6869207916666668,0.006021595270961332,0.688884916,2.080145,0.29434066666666664,0.6801623750000001,0.691715084,1455604 +0.136.0,2024-10-09-22,to_xlsx,30.79641108333333,0.03361517688094782,30.809203667,29.30711633333333,3.063931,30.758277708,30.821751875,32472 +0.136.0,2024-10-09-22,tojsonl,5.255716042,0.02947630512929443,5.257766042,10.161964666666664,0.4150133333333333,5.22526825,5.284113834,190259 +0.136.0,2024-10-09-22,tojsonl_batchall,5.2791820136666665,0.018680752373076034,5.275491625,10.091462666666665,0.37808400000000003,5.262621875,5.299432541,189430 +0.136.0,2024-10-09-22,tojsonl_index,2.8958170693333334,0.07985544815714167,2.86844975,11.432261333333335,0.4399403333333334,2.8332435,2.985757958,345304 +0.136.0,2024-10-09-22,tojsonl_index_j1,8.995612166666666,0.019833368532575935,9.003292875,8.607582666666666,0.3818063333333333,8.973087125,9.0104565,111161 +0.136.0,2024-10-09-22,tojsonl_j1,9.050394528,0.04188608359892298,9.033382667,9.022415666666666,0.430021,9.019690875,9.098110042,110497 +0.136.0,2024-10-09-22,tojsonl_trim,5.348996528000001,0.006708534294621601,5.348959584,10.950814333333334,0.3859336666666667,5.342306542,5.355723458,186951 
+0.136.0,2024-10-09-22,tojsonl_trim_j1,9.571418291666667,0.01507952192561961,9.56800225,9.526006333333333,0.4491516666666666,9.558339833,9.587912792000001,104482 +0.136.0,2024-10-09-22,transpose,2.6229285693333333,0.025532015383013567,2.611978792,2.4179046666666664,0.19915433333333332,2.60469775,2.652109166,381243 +0.136.0,2024-10-09-22,transpose_multipass,21.422549930333332,0.02144520581620947,21.414166625,19.688017333333335,1.7326546666666667,21.406562708,21.446920458,46679 +0.136.0,2024-10-09-22,validate,1.3454596946666666,0.011306645865111651,1.344570917,6.256567999999999,0.15226666666666666,1.334623667,1.3571845,743494 +0.136.0,2024-10-09-22,validate_batchall,1.2823367776666668,0.0075023406401299505,1.283977083,5.824079,0.11468199999999999,1.2741500000000001,1.28888325,780031 +0.136.0,2024-10-09-22,validate_batchall_index,1.2857204443333334,0.004130539026058984,1.286295208,5.745959666666667,0.12479633333333333,1.2813326250000001,1.2895335,777605 +0.136.0,2024-10-09-22,validate_dynenum,1.3203521386666666,0.004890651370770679,1.318326208,6.183933333333333,0.14744133333333334,1.3168,1.325930208,757576 +0.136.0,2024-10-09-22,validate_dynenum_batchall,1.272792861,0.0026021360365074747,1.273357416,5.749370666666667,0.11033633333333333,1.269954792,1.275066375,785546 +0.136.0,2024-10-09-22,validate_dynenum_batchall_index,1.271873194,0.006069566186101182,1.273125708,5.687784333333333,0.12177833333333332,1.2652750830000001,1.2772187910000001,786164 +0.136.0,2024-10-09-22,validate_dynenum_index,1.26976775,0.0012953332136516952,1.26936025,5.678887333333333,0.10425066666666666,1.268725167,1.2712178330000001,787402 +0.136.0,2024-10-09-22,validate_dynenum_no_schema,1.3317769723333333,0.00857000757395079,1.335231583,6.148984666666666,0.14937566666666666,1.322018834,1.3380805,750751 +0.136.0,2024-10-09-22,validate_dynenum_no_schema_index,1.267980167,0.004467809592701886,1.269947584,5.694462333333333,0.102148,1.262866292,1.271126625,788644 
+0.136.0,2024-10-09-22,validate_dynenum_valid_output,2.2017550413333336,0.0028643772618533727,2.202526583,7.002077333333333,0.19268933333333335,2.198583916,2.204154625,454133 +0.136.0,2024-10-09-22,validate_dynenum_valid_output_index,2.1348443749999997,0.0037203540786369966,2.136084209,6.538800666666667,0.1434173333333333,2.130662416,2.1377865,468384 +0.136.0,2024-10-09-22,validate_index,1.2818570833333334,0.0032141490070814053,1.2826240420000001,5.739469666666666,0.10224566666666667,1.278328833,1.284618375,780031 +0.136.0,2024-10-09-22,validate_no_schema,0.4880847496666667,0.0036591216167233724,0.489340916,0.4425666666666666,0.04411433333333333,0.48396300000000003,0.490950333,2049180 +0.136.0,2024-10-09-22,validate_no_schema_index,0.49482838866666673,0.0009017353931089343,0.49446141600000004,0.45398833333333327,0.039464,0.49416800000000005,0.49585575000000004,2020202 +0.136.0,2024-10-09-22,validate_valid_output,2.2125143473333337,0.004183940417602296,2.21488575,7.101555,0.1944523333333333,2.207683417,2.214973875,451875 +0.136.0,2024-10-09-22,validate_valid_output_index,2.143425805666667,0.008207043638664849,2.138873375,6.604312666666666,0.13987033333333332,2.138503959,2.152900083,466636 0.135.0,2024-09-24-16,apply_calcconv,1.6561738473333334,0.024501838167906777,1.647001417,3.308348666666666,0.18862966666666667,1.6375816250000002,1.6839385,603865 0.135.0,2024-09-24-16,apply_dynfmt,2.0836702916666665,0.10193847447027907,2.039226917,5.589999,0.19001333333333328,2.011498958,2.200285,479846 0.135.0,2024-09-24-16,apply_emptyreplace,1.52477925,0.0037840099082099747,1.5236345,2.2465766666666664,0.15877699999999997,1.521699791,1.5290034590000001,655738 diff --git a/scripts/results/benchmark_results_display.csv b/scripts/results/benchmark_results_display.csv index 4d741c49c..121285f76 100644 --- a/scripts/results/benchmark_results_display.csv +++ b/scripts/results/benchmark_results_display.csv @@ -1,4 +1,238 @@ 
version,tstamp,name,mean,recs_per_sec,stddev,median,user,system,min,max +0.136.0,2024-10-09-22,apply_calcconv,1.705,586510,0.02,1.706,3.551,0.209,1.685,1.724 +0.136.0,2024-10-09-22,apply_dynfmt,1.984,504032,0.015,1.991,6.894,0.234,1.966,1.993 +0.136.0,2024-10-09-22,apply_emptyreplace,1.6,625000,0.016,1.593,2.685,0.222,1.589,1.618 +0.136.0,2024-10-09-22,apply_op_eudex,1.578,633714,0.007,1.578,2.134,0.258,1.57,1.585 +0.136.0,2024-10-09-22,apply_op_sentiment,3.965,252207,0.38,3.761,24.91,0.295,3.73,4.403 +0.136.0,2024-10-09-22,apply_op_similarity,1.604,623441,0.011,1.601,2.228,0.209,1.594,1.616 +0.136.0,2024-10-09-22,apply_op_similarity_batchall,1.499,667111,0.018,1.492,1.798,0.174,1.485,1.52 +0.136.0,2024-10-09-22,apply_op_string,1.574,635324,0.016,1.568,2.681,0.194,1.561,1.592 +0.136.0,2024-10-09-22,behead,0.882,1133787,0.003,0.881,0.836,0.044,0.879,0.885 +0.136.0,2024-10-09-22,behead_flexible,0.881,1135074,0.001,0.88,0.833,0.046,0.88,0.882 +0.136.0,2024-10-09-22,cat_columns,2.844,351617,0.005,2.847,2.747,0.095,2.839,2.848 +0.136.0,2024-10-09-22,cat_rows,1.734,576701,0.018,1.74,1.641,0.092,1.714,1.748 +0.136.0,2024-10-09-22,cat_rows_flexible,1.714,583431,0.005,1.714,1.621,0.092,1.709,1.72 +0.136.0,2024-10-09-22,cat_rowskey,3.155,316957,0.026,3.145,3.054,0.1,3.136,3.184 +0.136.0,2024-10-09-22,count,0.074,13513514,0.003,0.074,0.427,0.052,0.071,0.077 +0.136.0,2024-10-09-22,count_flexible,0.462,2164502,0.002,0.463,0.417,0.044,0.46,0.464 +0.136.0,2024-10-09-22,count_index,0.008,125000000,0,0.008,0.005,0.002,0.008,0.008 +0.136.0,2024-10-09-22,count_no_polars,0.462,2164502,0.004,0.461,0.417,0.044,0.459,0.466 +0.136.0,2024-10-09-22,count_polars_lowmem,0.058,17241379,0.005,0.056,0.42,0.053,0.054,0.063 +0.136.0,2024-10-09-22,count_width,0.469,2132196,0.001,0.469,0.439,0.051,0.469,0.47 +0.136.0,2024-10-09-22,count_width_index,0.48,2083333,0.001,0.479,0.448,0.052,0.478,0.481 +0.136.0,2024-10-09-22,datefmt,1.774,563698,0.004,1.774,4.817,0.183,1.771,1.778 
+0.136.0,2024-10-09-22,datefmt_formatstr_newcol,1.697,589275,0.003,1.698,4.038,0.182,1.693,1.699 +0.136.0,2024-10-09-22,datefmt_multi,2.127,470146,0.004,2.13,8.931,0.183,2.122,2.13 +0.136.0,2024-10-09-22,datefmt_multi_batchall,2.076,481696,0.005,2.075,8.507,0.162,2.072,2.081 +0.136.0,2024-10-09-22,datefmt_multi_select,2.35,425532,0.01,2.35,11.609,0.191,2.34,2.36 +0.136.0,2024-10-09-22,dedup,1.283,779423,0.006,1.286,2.085,0.134,1.276,1.288 +0.136.0,2024-10-09-22,dedup_sorted,0.989,1011122,0.003,0.988,0.928,0.107,0.987,0.992 +0.136.0,2024-10-09-22,diff,2.645,378072,0.024,2.637,4.132,0.19,2.626,2.672 +0.136.0,2024-10-09-22,enum,0.894,1118568,0.004,0.892,0.845,0.047,0.891,0.898 +0.136.0,2024-10-09-22,enum_constant,0.884,1131222,0.009,0.885,0.835,0.048,0.875,0.892 +0.136.0,2024-10-09-22,enum_copy,0.893,1119821,0.012,0.89,0.845,0.047,0.882,0.906 +0.136.0,2024-10-09-22,enum_hash,1.507,663570,0.006,1.508,1.457,0.05,1.501,1.513 +0.136.0,2024-10-09-22,enum_uuid,1.697,589275,0.002,1.697,0.96,0.735,1.694,1.699 +0.136.0,2024-10-09-22,enum_uuid7,1.73,578035,0.002,1.731,0.991,0.737,1.728,1.732 +0.136.0,2024-10-09-22,excel,12.394,80684,0.025,12.38,12.27,0.921,12.379,12.423 +0.136.0,2024-10-09-22,excel_error_format_formula,21.884,45695,0.007,21.885,21.685,1.006,21.876,21.891 +0.136.0,2024-10-09-22,excel_j1,12.915,77429,0.047,12.901,12.124,0.784,12.877,12.967 +0.136.0,2024-10-09-22,excel_metadata,12.197,81987,0.058,12.222,11.176,1.018,12.131,12.238 +0.136.0,2024-10-09-22,excel_metadata_short,0.949,1053741,0.006,0.946,0.31,0.638,0.946,0.956 +0.136.0,2024-10-09-22,excel_trim,12.549,79688,0.038,12.529,13.758,0.887,12.527,12.593 +0.136.0,2024-10-09-22,excel_trim_j1,14.099,70927,0.031,14.112,13.282,0.812,14.063,14.121 +0.136.0,2024-10-09-22,exclude,0.559,1788909,0.001,0.559,0.517,0.041,0.559,0.56 +0.136.0,2024-10-09-22,exclude_casei,0.564,1773050,0.003,0.565,0.52,0.043,0.561,0.567 +0.136.0,2024-10-09-22,exclude_casei_index,0.578,1730104,0.004,0.578,0.534,0.043,0.575,0.583 
+0.136.0,2024-10-09-22,exclude_index,0.574,1742160,0,0.574,0.53,0.042,0.573,0.574 +0.136.0,2024-10-09-22,exclude_multi,0.882,1133787,0.011,0.877,0.841,0.04,0.875,0.894 +0.136.0,2024-10-09-22,exclude_multi_casei,0.89,1123596,0.008,0.888,0.849,0.039,0.884,0.898 +0.136.0,2024-10-09-22,exclude_multi_casei_index,0.913,1095290,0.004,0.913,0.867,0.044,0.909,0.917 +0.136.0,2024-10-09-22,exclude_multi_index,0.904,1106195,0.002,0.905,0.859,0.044,0.902,0.906 +0.136.0,2024-10-09-22,explode,1.587,630120,0.013,1.594,1.542,0.043,1.572,1.594 +0.136.0,2024-10-09-22,extdedup,1.055,947867,0.019,1.046,0.973,0.08,1.043,1.076 +0.136.0,2024-10-09-22,extsort,0.763,1310616,0.01,0.758,1.284,0.264,0.757,0.774 +0.136.0,2024-10-09-22,fill,2.088,478927,0.017,2.097,2.034,0.052,2.068,2.099 +0.136.0,2024-10-09-22,fixlengths,1.337,747943,0.004,1.336,1.248,0.088,1.333,1.341 +0.136.0,2024-10-09-22,flatten,5.604,178444,0.022,5.604,5.534,0.069,5.581,5.625 +0.136.0,2024-10-09-22,flatten_condensed,5.988,167001,0.014,5.995,5.916,0.071,5.971,5.997 +0.136.0,2024-10-09-22,fmt,1.003,997009,0.016,1.012,0.955,0.046,0.985,1.012 +0.136.0,2024-10-09-22,fmt_no_crlf,0.997,1003009,0.011,0.998,0.948,0.048,0.986,1.007 +0.136.0,2024-10-09-22,fmt_no_final_newline,1.006,994036,0.003,1.007,0.957,0.048,1.003,1.008 +0.136.0,2024-10-09-22,foreach,0.008,125000000,0,0.008,0.005,0.002,0.007,0.008 +0.136.0,2024-10-09-22,frequency,2.843,351741,0.016,2.847,3.763,0.157,2.825,2.855 +0.136.0,2024-10-09-22,frequency_ignorecase,3.541,282406,0.013,3.544,4.413,0.151,3.527,3.551 +0.136.0,2024-10-09-22,frequency_ignorecase_index,1.124,889680,0.057,1.151,5.658,0.266,1.058,1.162 +0.136.0,2024-10-09-22,frequency_index,1.048,954198,0.056,1.061,4.837,0.278,0.987,1.096 +0.136.0,2024-10-09-22,frequency_index_stats_mode_auto,1.007,993049,0.009,1.01,4.777,0.286,0.997,1.014 +0.136.0,2024-10-09-22,frequency_index_stats_mode_force,1.002,998004,0.017,1,4.753,0.288,0.985,1.019 
+0.136.0,2024-10-09-22,frequency_index_stats_mode_none,1.118,894454,0.009,1.12,5.2,0.287,1.108,1.125 +0.136.0,2024-10-09-22,frequency_j1,3.388,295159,0.03,3.371,3.294,0.091,3.37,3.422 +0.136.0,2024-10-09-22,frequency_j1_ignorecase,4.097,244081,0.044,4.075,3.999,0.095,4.068,4.148 +0.136.0,2024-10-09-22,frequency_limit20,2.863,349284,0.017,2.86,3.767,0.156,2.847,2.882 +0.136.0,2024-10-09-22,frequency_limit20_index,1.063,940734,0.057,1.081,4.816,0.27,1,1.109 +0.136.0,2024-10-09-22,frequency_no_limit,4.727,211551,0.062,4.698,5.593,0.18,4.685,4.798 +0.136.0,2024-10-09-22,frequency_no_limit_index,2.602,384320,0.084,2.557,6.371,0.309,2.55,2.699 +0.136.0,2024-10-09-22,frequency_notrim,2.403,416146,0.009,2.398,3.296,0.154,2.397,2.413 +0.136.0,2024-10-09-22,frequency_notrim_index,0.982,1018330,0.047,0.959,4.255,0.275,0.95,1.037 +0.136.0,2024-10-09-22,frequency_other_sorted,2.839,352237,0.009,2.842,3.706,0.154,2.828,2.845 +0.136.0,2024-10-09-22,frequency_other_sorted_index,1.044,957854,0.044,1.057,4.786,0.279,0.995,1.08 +0.136.0,2024-10-09-22,frequency_selregex,0.803,1245330,0.006,0.806,0.88,0.053,0.796,0.807 +0.136.0,2024-10-09-22,frequency_selregex_ignorecase,1.002,998004,0.004,1.001,1.078,0.055,0.999,1.006 +0.136.0,2024-10-09-22,frequency_sorted,2.67,374532,0.021,2.673,3.504,0.163,2.648,2.69 +0.136.0,2024-10-09-22,frequency_sorted_index,2.759,362450,0.007,2.759,3.568,0.145,2.753,2.766 +0.136.0,2024-10-09-22,geocode_reverse,3.635,275103,0.022,3.638,2.964,10.649,3.612,3.656 +0.136.0,2024-10-09-22,geocode_reverse_batchall,3.706,269833,0.001,3.706,3.307,10.219,3.705,3.707 +0.136.0,2024-10-09-22,geocode_suggest,4.424,226040,0.013,4.42,21.384,2.652,4.414,4.439 +0.136.0,2024-10-09-22,geocode_suggest_batchall,4.436,225428,0.012,4.441,22.09,1.424,4.423,4.445 +0.136.0,2024-10-09-22,index,0.474,2109705,0,0.474,0.43,0.043,0.474,0.474 +0.136.0,2024-10-09-22,input,1.652,605327,0.005,1.651,1.6,0.051,1.648,1.657 +0.136.0,2024-10-09-22,join,1.892,528541,0.001,1.891,1.351,0.539,1.891,1.893 
+0.136.0,2024-10-09-22,join_casei,1.899,526593,0.006,1.9,1.359,0.539,1.893,1.905 +0.136.0,2024-10-09-22,joinp,0.791,1264223,0.002,0.791,1.408,0.09,0.79,0.794 +0.136.0,2024-10-09-22,joinp_streaming,0.026,38461538,0,0.026,0.034,0.01,0.026,0.026 +0.136.0,2024-10-09-22,json,18.516,54007,0.053,18.505,16.98,1.526,18.468,18.574 +0.136.0,2024-10-09-22,jsonl,1.446,691563,0.003,1.446,7.012,0.198,1.444,1.449 +0.136.0,2024-10-09-22,jsonl_batchall,0.008,125000000,0,0.008,0.005,0.002,0.008,0.008 +0.136.0,2024-10-09-22,jsonl_j1,5.82,171821,0.011,5.82,5.686,0.132,5.808,5.831 +0.136.0,2024-10-09-22,luau_filter,7.429,134608,0.009,7.424,6.889,0.539,7.423,7.44 +0.136.0,2024-10-09-22,luau_filter_colidx,9.098,109914,0.025,9.092,8.539,0.557,9.077,9.126 +0.136.0,2024-10-09-22,luau_filter_no_globals,4.837,206740,0.007,4.841,4.306,0.53,4.829,4.842 +0.136.0,2024-10-09-22,luau_filter_no_globals_colidx,6.395,156372,0.019,6.405,5.862,0.532,6.374,6.407 +0.136.0,2024-10-09-22,luau_filter_no_globals_colidx,6.388,156544,0.011,6.394,5.851,0.536,6.376,6.395 +0.136.0,2024-10-09-22,luau_filter_no_globals_no_colidx,4.827,207168,0.01,4.822,4.299,0.527,4.821,4.839 +0.136.0,2024-10-09-22,luau_multi,23.365,42799,0.068,23.364,22.727,0.636,23.297,23.433 +0.136.0,2024-10-09-22,luau_multi_colidx,24.958,40067,0.027,24.947,24.302,0.654,24.938,24.988 +0.136.0,2024-10-09-22,luau_multi_no_globals,20.823,48024,0.297,20.659,20.182,0.639,20.643,21.166 +0.136.0,2024-10-09-22,luau_multi_no_globals_colidx,22.248,44948,0.074,22.24,21.591,0.655,22.178,22.325 +0.136.0,2024-10-09-22,luau_script,33.838,29553,0.05,33.84,33.181,0.655,33.786,33.887 +0.136.0,2024-10-09-22,luau_script_colidx,35.363,28278,0.042,35.379,34.698,0.663,35.315,35.394 +0.136.0,2024-10-09-22,luau_script_no_globals,31.057,32199,0.037,31.065,30.402,0.653,31.017,31.089 +0.136.0,2024-10-09-22,luau_script_no_globals_colidx,32.39,30874,0.101,32.333,31.734,0.654,32.33,32.506 +0.136.0,2024-10-09-22,partition,2.327,429738,0.015,2.324,0.937,1.361,2.314,2.343 
+0.136.0,2024-10-09-22,pseudo,1.715,583090,0.001,1.715,1.657,0.056,1.714,1.716 +0.136.0,2024-10-09-22,pseudo_formatstr,1.927,518941,0.01,1.929,1.867,0.059,1.916,1.936 +0.136.0,2024-10-09-22,rename,0.987,1013171,0.001,0.987,0.939,0.047,0.987,0.988 +0.136.0,2024-10-09-22,replace,2.09,478469,0.005,2.091,2.038,0.051,2.085,2.095 +0.136.0,2024-10-09-22,reverse,0.975,1025641,0.004,0.975,0.866,0.104,0.972,0.979 +0.136.0,2024-10-09-22,reverse_index,6.118,163452,0.03,6.102,1.246,4.86,6.099,6.152 +0.136.0,2024-10-09-22,safenames,0.987,1013171,0.002,0.987,0.94,0.047,0.986,0.989 +0.136.0,2024-10-09-22,sample_10,0.518,1930502,0.003,0.516,0.472,0.044,0.516,0.521 +0.136.0,2024-10-09-22,sample_1000,0.514,1945525,0.001,0.513,0.471,0.042,0.513,0.516 +0.136.0,2024-10-09-22,sample_100000,0.651,1536098,0.004,0.65,0.603,0.047,0.647,0.655 +0.136.0,2024-10-09-22,sample_100000_index,1.142,875657,0.002,1.143,0.181,0.96,1.139,1.143 +0.136.0,2024-10-09-22,sample_100000_seeded,0.648,1543210,0.001,0.648,0.599,0.047,0.647,0.648 +0.136.0,2024-10-09-22,sample_100000_seeded_faster,0.637,1569859,0.003,0.637,0.59,0.046,0.635,0.64 +0.136.0,2024-10-09-22,sample_100000_seeded_index,1.134,881834,0.008,1.133,0.178,0.955,1.127,1.143 +0.136.0,2024-10-09-22,sample_100000_seeded_index_faster,1.119,893655,0.004,1.118,0.172,0.946,1.115,1.124 +0.136.0,2024-10-09-22,sample_100000_seeded_index_secure,1.13,884956,0.014,1.125,0.174,0.954,1.119,1.145 +0.136.0,2024-10-09-22,sample_100000_seeded_secure,0.644,1552795,0.006,0.641,0.596,0.047,0.64,0.651 +0.136.0,2024-10-09-22,sample_1000_index,0.033,30303030,0,0.033,0.017,0.015,0.033,0.033 +0.136.0,2024-10-09-22,sample_10_index,0.019,52631579,0,0.019,0.015,0.003,0.019,0.019 +0.136.0,2024-10-09-22,sample_25pct_index,2.784,359195,0.011,2.781,0.417,2.365,2.775,2.796 +0.136.0,2024-10-09-22,sample_25pct_seeded_index,2.781,359583,0.004,2.781,0.42,2.36,2.777,2.785 +0.136.0,2024-10-09-22,schema,7.817,127926,0.041,7.831,13.273,0.393,7.771,7.848 
+0.136.0,2024-10-09-22,schema_index,0.111,9009009,0,0.111,1.024,0.062,0.11,0.111 +0.136.0,2024-10-09-22,search,0.587,1703578,0.001,0.586,0.542,0.044,0.586,0.588 +0.136.0,2024-10-09-22,search_file,1.069,935454,0.002,1.07,1.022,0.046,1.067,1.071 +0.136.0,2024-10-09-22,search_file_case_sensitive,0.984,1016260,0.001,0.984,0.937,0.046,0.984,0.985 +0.136.0,2024-10-09-22,search_file_case_sensitive_unicode,0.984,1016260,0.002,0.983,0.935,0.048,0.982,0.985 +0.136.0,2024-10-09-22,search_file_flag,1.253,798085,0.001,1.253,1.203,0.049,1.252,1.254 +0.136.0,2024-10-09-22,search_file_flag_matchonly,0.841,1189061,0.003,0.84,0.794,0.046,0.839,0.844 +0.136.0,2024-10-09-22,search_file_literal,0.942,1061571,0.004,0.943,0.894,0.047,0.937,0.946 +0.136.0,2024-10-09-22,search_file_unicode,1.088,919118,0.002,1.088,1.04,0.047,1.087,1.09 +0.136.0,2024-10-09-22,search_unicode,0.586,1706485,0.001,0.586,0.542,0.044,0.585,0.587 +0.136.0,2024-10-09-22,searchset,1.248,801282,0.003,1.247,1.569,0.104,1.245,1.252 +0.136.0,2024-10-09-22,searchset_ignorecase,1.441,693963,0.001,1.441,1.76,0.104,1.439,1.441 +0.136.0,2024-10-09-22,searchset_unicode,1.245,803213,0.005,1.244,1.559,0.102,1.241,1.25 +0.136.0,2024-10-09-22,select,0.491,2036660,0,0.491,0.447,0.043,0.491,0.492 +0.136.0,2024-10-09-22,select_regex,0.527,1897533,0.002,0.526,0.482,0.044,0.525,0.529 +0.136.0,2024-10-09-22,slice_last_1k,0.009,111111111,0,0.009,0.006,0.002,0.009,0.009 +0.136.0,2024-10-09-22,slice_last_1k_index,0.009,111111111,0,0.009,0.006,0.002,0.009,0.009 +0.136.0,2024-10-09-22,slice_last_1k_json,0.012,83333333,0,0.012,0.009,0.002,0.012,0.012 +0.136.0,2024-10-09-22,slice_last_1k_json_index,0.012,83333333,0,0.012,0.009,0.003,0.012,0.012 +0.136.0,2024-10-09-22,slice_one_middle,0.257,3891051,0.001,0.257,0.232,0.024,0.256,0.258 +0.136.0,2024-10-09-22,slice_one_middle_index,0.008,125000000,0,0.008,0.005,0.002,0.008,0.008 +0.136.0,2024-10-09-22,snappy_compress,0.175,5714286,0.006,0.179,1.162,0.107,0.168,0.179 
+0.136.0,2024-10-09-22,snappy_decompress,0.495,2020202,0.002,0.495,0.473,0.021,0.493,0.498 +0.136.0,2024-10-09-22,snappy_validate,0.478,2092050,0,0.478,0.457,0.019,0.477,0.478 +0.136.0,2024-10-09-22,sort,1.245,803213,0.004,1.244,2.116,0.128,1.241,1.249 +0.136.0,2024-10-09-22,sort_random_seeded,1.176,850340,0.017,1.172,1.064,0.107,1.16,1.194 +0.136.0,2024-10-09-22,sort_random_seeded_faster,1.165,858369,0.012,1.168,1.048,0.113,1.153,1.176 +0.136.0,2024-10-09-22,sort_random_seeded_secure,1.175,851064,0.011,1.169,1.065,0.106,1.169,1.188 +0.136.0,2024-10-09-22,sortcheck_sorted,0.498,2008032,0,0.499,0.454,0.043,0.498,0.499 +0.136.0,2024-10-09-22,sortcheck_unsorted,0.008,125000000,0,0.007,0.005,0.002,0.007,0.008 +0.136.0,2024-10-09-22,sortcheck_unsorted_all,0.533,1876173,0.003,0.533,0.487,0.045,0.53,0.535 +0.136.0,2024-10-09-22,split,0.972,1028807,0.05,0.965,0.817,0.107,0.926,1.024 +0.136.0,2024-10-09-22,split_chunks,1.016,984252,0.029,1.025,1.239,0.156,0.984,1.039 +0.136.0,2024-10-09-22,split_chunks_index,0.237,4219409,0.115,0.203,1.065,0.205,0.143,0.365 +0.136.0,2024-10-09-22,split_chunks_index_j1,0.009,111111111,0,0.009,0.005,0.004,0.009,0.009 +0.136.0,2024-10-09-22,split_index,0.207,4830918,0.057,0.239,1.044,0.2,0.141,0.24 +0.136.0,2024-10-09-22,split_index_j1,1.027,973710,0.038,1.01,0.88,0.112,0.999,1.071 +0.136.0,2024-10-09-22,split_kbsize,2.037,490918,0.043,2.019,1.898,0.11,2.005,2.086 +0.136.0,2024-10-09-22,sqlp,1.047,955110,0.014,1.046,2.012,0.273,1.033,1.061 +0.136.0,2024-10-09-22,sqlp_aggregations,0.992,1008065,0.011,0.985,1.375,0.057,0.985,1.004 +0.136.0,2024-10-09-22,sqlp_aggregations_expensive,0.858,1165501,0.011,0.854,1.429,0.097,0.85,0.87 +0.136.0,2024-10-09-22,sqlp_aggregations_expensive_streaming,0.859,1164144,0.008,0.856,1.429,0.092,0.853,0.868 +0.136.0,2024-10-09-22,sqlp_aggregations_expensive_vs_duckdb,0.22,4545455,0.005,0.218,0.88,0.123,0.217,0.226 +0.136.0,2024-10-09-22,sqlp_aggregations_vs_duckdb,0.194,5154639,0.005,0.196,0.615,0.102,0.189,0.198 
+0.136.0,2024-10-09-22,sqlp_format_arrow,1.736,576037,0.003,1.736,2.213,0.727,1.734,1.74 +0.136.0,2024-10-09-22,sqlp_format_avro,1.03,970874,0.005,1.029,1.986,0.374,1.026,1.036 +0.136.0,2024-10-09-22,sqlp_format_json,1.023,977517,0.001,1.024,1.965,0.259,1.021,1.024 +0.136.0,2024-10-09-22,sqlp_format_jsonl,1.024,976562,0.002,1.023,1.964,0.256,1.022,1.026 +0.136.0,2024-10-09-22,sqlp_format_parquet,1.025,975610,0.002,1.024,1.976,0.271,1.024,1.027 +0.136.0,2024-10-09-22,sqlp_format_parquet_statistics,1.025,975610,0.002,1.025,1.974,0.28,1.023,1.027 +0.136.0,2024-10-09-22,sqlp_lowmemory,1.015,985222,0.002,1.015,1.963,0.254,1.013,1.017 +0.136.0,2024-10-09-22,sqlp_nooptimizations,1.048,954198,0.001,1.047,2.015,0.187,1.047,1.049 +0.136.0,2024-10-09-22,sqlp_tryparsedates,11.918,83907,0.014,11.914,12.857,0.265,11.907,11.934 +0.136.0,2024-10-09-22,sqlp_tryparsedates_inferlen,1.918,521376,0.006,1.92,2.864,0.25,1.911,1.923 +0.136.0,2024-10-09-22,stats,1.91,523560,0.005,1.911,2.219,0.095,1.904,1.914 +0.136.0,2024-10-09-22,stats_create_cache,1.91,523560,0.005,1.912,2.22,0.093,1.905,1.913 +0.136.0,2024-10-09-22,stats_everything,3.355,298063,0.04,3.344,10.192,0.393,3.322,3.4 +0.136.0,2024-10-09-22,stats_everything_create_cache,3.39,294985,0.019,3.382,10.238,0.393,3.376,3.411 +0.136.0,2024-10-09-22,stats_everything_index,1.331,751315,0.009,1.329,10.826,0.567,1.323,1.341 +0.136.0,2024-10-09-22,stats_everything_index_j1,6.615,151172,0.154,6.637,6.551,0.212,6.451,6.757 +0.136.0,2024-10-09-22,stats_everything_index_j1_with_cache,0.008,125000000,0,0.008,0.005,0.002,0.008,0.008 +0.136.0,2024-10-09-22,stats_everything_index_with_cache,0.008,125000000,0,0.008,0.005,0.003,0.008,0.008 +0.136.0,2024-10-09-22,stats_everything_infer_dates,7.172,139431,0.002,7.172,14.419,0.436,7.171,7.174 +0.136.0,2024-10-09-22,stats_everything_infer_dates_index,1.893,528262,0.026,1.904,17.085,0.587,1.863,1.912 
+0.136.0,2024-10-09-22,stats_everything_infer_dates_index_with_cache,0.008,125000000,0,0.008,0.005,0.003,0.008,0.008 +0.136.0,2024-10-09-22,stats_everything_j1,6.375,156863,0.039,6.378,6.588,0.261,6.335,6.412 +0.136.0,2024-10-09-22,stats_everything_sorted,3.309,302206,0.016,3.309,9.845,0.404,3.293,3.325 +0.136.0,2024-10-09-22,stats_everything_sorted_index,3.328,300481,0.007,3.327,9.829,0.41,3.322,3.335 +0.136.0,2024-10-09-22,stats_index,0.226,4424779,0.005,0.225,2.294,0.066,0.223,0.232 +0.136.0,2024-10-09-22,stats_index_j1,1.886,530223,0.001,1.886,1.844,0.04,1.884,1.887 +0.136.0,2024-10-09-22,stats_index_j1_with_cache,0.008,125000000,0,0.008,0.005,0.002,0.008,0.008 +0.136.0,2024-10-09-22,stats_index_with_cache,0.008,125000000,0,0.008,0.005,0.003,0.008,0.008 +0.136.0,2024-10-09-22,table,6.831,146391,0.023,6.828,5.733,1.09,6.81,6.856 +0.136.0,2024-10-09-22,to_datapackage,0.695,1438849,0.004,0.694,2.098,0.302,0.693,0.699 +0.136.0,2024-10-09-22,to_sqlite,0.687,1455604,0.006,0.689,2.08,0.294,0.68,0.692 +0.136.0,2024-10-09-22,to_xlsx,30.796,32472,0.034,30.809,29.307,3.064,30.758,30.822 +0.136.0,2024-10-09-22,tojsonl,5.256,190259,0.029,5.258,10.162,0.415,5.225,5.284 +0.136.0,2024-10-09-22,tojsonl_batchall,5.279,189430,0.019,5.275,10.091,0.378,5.263,5.299 +0.136.0,2024-10-09-22,tojsonl_index,2.896,345304,0.08,2.868,11.432,0.44,2.833,2.986 +0.136.0,2024-10-09-22,tojsonl_index_j1,8.996,111161,0.02,9.003,8.608,0.382,8.973,9.01 +0.136.0,2024-10-09-22,tojsonl_j1,9.05,110497,0.042,9.033,9.022,0.43,9.02,9.098 +0.136.0,2024-10-09-22,tojsonl_trim,5.349,186951,0.007,5.349,10.951,0.386,5.342,5.356 +0.136.0,2024-10-09-22,tojsonl_trim_j1,9.571,104482,0.015,9.568,9.526,0.449,9.558,9.588 +0.136.0,2024-10-09-22,transpose,2.623,381243,0.026,2.612,2.418,0.199,2.605,2.652 +0.136.0,2024-10-09-22,transpose_multipass,21.423,46679,0.021,21.414,19.688,1.733,21.407,21.447 +0.136.0,2024-10-09-22,validate,1.345,743494,0.011,1.345,6.257,0.152,1.335,1.357 
+0.136.0,2024-10-09-22,validate_batchall,1.282,780031,0.008,1.284,5.824,0.115,1.274,1.289 +0.136.0,2024-10-09-22,validate_batchall_index,1.286,777605,0.004,1.286,5.746,0.125,1.281,1.29 +0.136.0,2024-10-09-22,validate_dynenum,1.32,757576,0.005,1.318,6.184,0.147,1.317,1.326 +0.136.0,2024-10-09-22,validate_dynenum_batchall,1.273,785546,0.003,1.273,5.749,0.11,1.27,1.275 +0.136.0,2024-10-09-22,validate_dynenum_batchall_index,1.272,786164,0.006,1.273,5.688,0.122,1.265,1.277 +0.136.0,2024-10-09-22,validate_dynenum_index,1.27,787402,0.001,1.269,5.679,0.104,1.269,1.271 +0.136.0,2024-10-09-22,validate_dynenum_no_schema,1.332,750751,0.009,1.335,6.149,0.149,1.322,1.338 +0.136.0,2024-10-09-22,validate_dynenum_no_schema_index,1.268,788644,0.004,1.27,5.694,0.102,1.263,1.271 +0.136.0,2024-10-09-22,validate_dynenum_valid_output,2.202,454133,0.003,2.203,7.002,0.193,2.199,2.204 +0.136.0,2024-10-09-22,validate_dynenum_valid_output_index,2.135,468384,0.004,2.136,6.539,0.143,2.131,2.138 +0.136.0,2024-10-09-22,validate_index,1.282,780031,0.003,1.283,5.739,0.102,1.278,1.285 +0.136.0,2024-10-09-22,validate_no_schema,0.488,2049180,0.004,0.489,0.443,0.044,0.484,0.491 +0.136.0,2024-10-09-22,validate_no_schema_index,0.495,2020202,0.001,0.494,0.454,0.039,0.494,0.496 +0.136.0,2024-10-09-22,validate_valid_output,2.213,451875,0.004,2.215,7.102,0.194,2.208,2.215 +0.136.0,2024-10-09-22,validate_valid_output_index,2.143,466636,0.008,2.139,6.604,0.14,2.139,2.153 0.135.0,2024-09-24-16,apply_calcconv,1.656,603865,0.025,1.647,3.308,0.189,1.638,1.684 0.135.0,2024-09-24-16,apply_dynfmt,2.084,479846,0.102,2.039,5.59,0.19,2.011,2.2 0.135.0,2024-09-24-16,apply_emptyreplace,1.525,655738,0.004,1.524,2.247,0.159,1.522,1.529 diff --git a/scripts/results/latest_results.csv b/scripts/results/latest_results.csv index 024bf4948..47b49b6bc 100644 --- a/scripts/results/latest_results.csv +++ b/scripts/results/latest_results.csv @@ -1,234 +1,235 @@ version,tstamp,name,mean,stddev,median,user,system,min,max,recs_per_sec 
-0.135.0,2024-09-24-16,apply_calcconv,1.6561738473333334,0.024501838167906777,1.647001417,3.308348666666666,0.18862966666666667,1.6375816250000002,1.6839385,603865 -0.135.0,2024-09-24-16,apply_dynfmt,2.0836702916666665,0.10193847447027907,2.039226917,5.589999,0.19001333333333328,2.011498958,2.200285,479846 -0.135.0,2024-09-24-16,apply_emptyreplace,1.52477925,0.0037840099082099747,1.5236345,2.2465766666666664,0.15877699999999997,1.521699791,1.5290034590000001,655738 -0.135.0,2024-09-24-16,apply_op_eudex,1.4709690000000002,0.004635660128896329,1.4701724170000001,1.6751436666666664,0.17626266666666668,1.46678325,1.475951333,679810 -0.135.0,2024-09-24-16,apply_op_similarity,1.5026988053333332,0.036124805945877386,1.489682083,1.8088156666666666,0.16535066666666665,1.47488625,1.543528083,665336 -0.135.0,2024-09-24-16,apply_op_similarity_batchall,1.6281318333333334,0.013907561155338224,1.620127667,2.1134876666666664,0.42611133333333334,1.620076958,1.644190875,614251 -0.135.0,2024-09-24-16,apply_op_string,1.5299231526666668,0.006093751682075404,1.5332260830000002,2.2687676666666667,0.16097799999999998,1.522891,1.533652375,653595 -0.135.0,2024-09-24-16,behead,0.918768736,0.004973469597498395,0.918477792,0.8748636666666667,0.04174366666666666,0.9139471250000001,0.923881291,1088139 -0.135.0,2024-09-24-16,behead_flexible,0.9622273333333333,0.010936315635725875,0.9673666670000001,0.9192056666666666,0.04088366666666666,0.9496680000000001,0.9696473330000001,1039501 -0.135.0,2024-09-24-16,cat_columns,2.9672721806666664,0.006632064974674839,2.964664042,2.8792030000000004,0.08591366666666667,2.9623406670000003,2.974811833,337041 -0.135.0,2024-09-24-16,cat_rows,1.8585314860000002,0.013529493584203123,1.863656125,1.773313,0.08310933333333333,1.843188292,1.8687500410000002,537924 -0.135.0,2024-09-24-16,cat_rows_flexible,1.8210944586666666,0.0035848271599336215,1.821112792,1.735658,0.08283,1.8175005,1.824670084,549149 
-0.135.0,2024-09-24-16,cat_rowskey,3.2519307360000003,0.007564855727902409,3.255147208,3.163473666666667,0.085781,3.243289167,3.257355833,307503 -0.135.0,2024-09-24-16,count,0.06696670833333333,0.0011541730024014311,0.067063833,0.42164599999999997,0.052527333333333336,0.065767042,0.06806925,14925373 -0.135.0,2024-09-24-16,count_flexible,0.461102903,0.0006918031658015236,0.461196667,0.41882133333333327,0.040195666666666664,0.46036900000000003,0.46174304200000005,2169197 -0.135.0,2024-09-24-16,count_index,0.00899275,0.0001997653310762403,0.0090735,0.0048839999999999995,0.0021863333333333335,0.00876525,0.0091395,111111111 -0.135.0,2024-09-24-16,count_no_polars,0.46324341666666663,0.0018735530965847498,0.46391216700000004,0.4188686666666666,0.04220566666666666,0.46112725000000004,0.464690833,2159827 -0.135.0,2024-09-24-16,count_polars_lowmem,0.05739552766666667,0.0019269810759188401,0.057406417,0.40767766666666666,0.052958,0.055463125,0.059317041,17543860 -0.135.0,2024-09-24-16,count_width,0.4720493053333334,0.0063411977419242675,0.46867375000000006,0.44237566666666667,0.04803866666666667,0.468109916,0.47936425000000005,2118644 -0.135.0,2024-09-24-16,count_width_index,0.4930666803333334,0.016577662597782625,0.48702358300000004,0.4615993333333333,0.04938033333333333,0.480358333,0.511818125,2028398 -0.135.0,2024-09-24-16,datefmt,1.7239539170000002,0.0008712174631737919,1.723511875,4.3592059999999995,0.16835433333333336,1.7233923340000001,1.724957542,580046 -0.135.0,2024-09-24-16,datefmt_formatstr_newcol,1.6420713473333333,0.0029293991572182767,1.6422111670000001,3.5576136666666662,0.1609013333333333,1.639074542,1.6449283330000002,609013 -0.135.0,2024-09-24-16,datefmt_multi,2.084292222333333,0.004543992222203884,2.083293625,8.462918333333333,0.16106233333333334,2.080330583,2.089252459,479846 -0.135.0,2024-09-24-16,datefmt_multi_batchall,2.198415069666667,0.002017713862499274,2.197255417,8.809954666666668,0.33751266666666674,2.197244875,2.200744917,454959 
-0.135.0,2024-09-24-16,datefmt_multi_select,2.323542041666667,0.009982654504934525,2.319155041,11.266940333333332,0.165737,2.316504125,2.334966959,430293 -0.135.0,2024-09-24-16,dedup,1.3292426666666668,0.007532498844117107,1.333438292,2.1353999999999997,0.12802433333333332,1.320546666,1.333743042,752445 -0.135.0,2024-09-24-16,dedup_sorted,1.0500485560000001,0.0009519418556104904,1.050291417,0.9836123333333333,0.11224633333333334,1.048998709,1.050855542,952381 -0.135.0,2024-09-24-16,diff,2.6763047776666666,0.007511558345475026,2.676419959,4.152838666666667,0.18858233333333332,2.668736291,2.683758083,373692 -0.135.0,2024-09-24-16,enum,0.9571187776666666,0.00011805103588841627,0.95709675,0.9137013333333334,0.041293666666666666,0.9570132920000001,0.957246291,1044932 -0.135.0,2024-09-24-16,enum_constant,0.9562375833333334,0.002867278690866569,0.955433708,0.910845,0.04322433333333333,0.953858042,0.9594210000000001,1046025 -0.135.0,2024-09-24-16,enum_copy,0.9635569446666667,0.0005196529601083491,0.963711834,0.9193009999999999,0.04212933333333333,0.962977458,0.9639815420000001,1037344 -0.135.0,2024-09-24-16,enum_hash,1.5690463053333332,0.0010462513805603616,1.5693133330000002,1.5240173333333331,0.042890000000000005,1.567892417,1.569933166,637349 -0.135.0,2024-09-24-16,enum_uuid,1.759877972,0.002356836554815807,1.760088542,1.0258450000000001,0.7319576666666666,1.757422916,1.762122458,568182 -0.135.0,2024-09-24-16,enum_uuid7,1.7865406110000002,0.003500286745534298,1.788556,1.0520086666666666,0.732247,1.782498833,1.788567,559597 -0.135.0,2024-09-24-16,excel,12.407360777666668,0.013094151597758374,12.411723291,12.295141,0.8913913333333333,12.39264225,12.417716792,80600 -0.135.0,2024-09-24-16,excel_error_format_formula,22.405064653,0.22214262043431157,22.300814542,22.123282,1.0543763333333331,22.254221042,22.660158375,44633 -0.135.0,2024-09-24-16,excel_j1,12.996690555666667,0.033254665839665305,12.9904405,12.195943999999999,0.7920936666666666,12.967004375,13.032626792,76941 
-0.135.0,2024-09-24-16,excel_metadata,11.952817139333334,0.04348827542480086,11.948436167,11.183247,0.7588929999999999,11.911685167,11.998330084000001,83661 -0.135.0,2024-09-24-16,excel_metadata_short,0.522740764,0.008032774392137298,0.5240271670000001,0.16315733333333335,0.3567106666666666,0.5141424170000001,0.530052708,1912046 -0.135.0,2024-09-24-16,excel_trim,12.618090083333334,0.025927595122572866,12.606782792,13.621176666666665,1.0246179999999998,12.599736458,12.647751,79252 -0.135.0,2024-09-24-16,excel_trim_j1,14.112316291333334,0.00887533405014761,14.113719125,13.218260999999998,0.8820646666666666,14.102823083,14.120406666,70862 -0.135.0,2024-09-24-16,exclude,0.5736019166666667,0.0025727979095399906,0.57331475,0.5309276666666666,0.039996333333333335,0.57118475,0.57630625,1742160 -0.135.0,2024-09-24-16,exclude_casei,0.5782353196666667,0.0007637199804642468,0.5786370000000001,0.5363053333333333,0.039253666666666666,0.5773545840000001,0.578714375,1730104 -0.135.0,2024-09-24-16,exclude_casei_index,0.5852502780000001,0.006363961176733965,0.585642375,0.5430256666666665,0.03989499999999999,0.5786993340000001,0.5914091250000001,1709402 -0.135.0,2024-09-24-16,exclude_index,0.5787455973333334,0.005896403261463427,0.576707167,0.5367526666666667,0.03968566666666667,0.574138875,0.5853907500000001,1727116 -0.135.0,2024-09-24-16,exclude_multi,0.9076274863333333,0.0038674071804712406,0.9086384590000001,0.8633686666666667,0.041767,0.903355,0.9108890000000001,1101322 -0.135.0,2024-09-24-16,exclude_multi_casei,0.9310793473333333,0.02900865121910744,0.916849208,0.888016,0.040321666666666665,0.9119336250000001,0.964455209,1074114 -0.135.0,2024-09-24-16,exclude_multi_casei_index,0.918678528,0.0037736147407016448,0.920563459,0.8755213333333333,0.04095466666666667,0.9143337920000001,0.921138333,1088139 -0.135.0,2024-09-24-16,exclude_multi_index,0.9204265420000001,0.009361009899892699,0.9205869170000001,0.8766863333333333,0.041166666666666664,0.910986375,0.929706334,1086957 
-0.135.0,2024-09-24-16,explode,1.6107419026666667,0.004866132637237246,1.6128365420000002,1.563614333333333,0.044317999999999996,1.605179208,1.614209958,620732 -0.135.0,2024-09-24-16,extdedup,1.067265653,0.012404302795964581,1.068366,0.9814016666666667,0.081241,1.054347834,1.079083125,937207 -0.135.0,2024-09-24-16,extsort,0.8386422776666667,0.035784224237229316,0.823837708,1.3196759999999998,0.29566299999999995,0.8126360420000001,0.879453083,1191895 -0.135.0,2024-09-24-16,fill,2.141778291666667,0.008741486049120752,2.144456583,2.0914179999999996,0.04634733333333333,2.132011,2.148867292,466853 -0.135.0,2024-09-24-16,fixlengths,1.4535256113333332,0.005173087035179922,1.45482025,1.3682683333333332,0.08247633333333333,1.447828167,1.457928417,687758 -0.135.0,2024-09-24-16,flatten,5.631412597000001,0.0072111346127681575,5.633534958,5.576536333333333,0.051366666666666665,5.623378458,5.637324375,177588 -0.135.0,2024-09-24-16,flatten_condensed,6.138042236333334,0.059734858038596836,6.118565834,6.082980666666667,0.051888,6.090476375,6.2050845,162920 -0.135.0,2024-09-24-16,fmt,1.0300456943333334,0.0036681274449213083,1.029574667,0.9828433333333333,0.044510333333333325,1.026635833,1.033926583,970874 -0.135.0,2024-09-24-16,fmt_no_crlf,1.032838611,0.007629201759700244,1.031791583,0.9860773333333333,0.04427066666666666,1.025787,1.04093725,968054 -0.135.0,2024-09-24-16,fmt_no_final_newline,1.0341688056666667,0.0025591625386629535,1.033542042,0.9875346666666666,0.04334,1.03198125,1.036983125,967118 -0.135.0,2024-09-24-16,foreach,0.006671138666666667,0.00006106024931437279,0.006689208,0.004203666666666667,0.0014823333333333331,0.006603083,0.0067211250000000005,142857143 -0.135.0,2024-09-24-16,frequency,3.1963116389999997,0.06532450376844255,3.219018666,4.1295416666666656,0.151076,3.122663792,3.247252459,312891 -0.135.0,2024-09-24-16,frequency_ignorecase,3.8656460136666664,0.0811792896967743,3.888340791,4.806199333333333,0.1503173333333333,3.7755345,3.93306275,258665 
-0.135.0,2024-09-24-16,frequency_ignorecase_index,1.151306375,0.03892438847664332,1.173626625,5.847245999999999,0.298613,1.106360708,1.173931792,868810 -0.135.0,2024-09-24-16,frequency_index,1.0664998609999998,0.04197956631418569,1.059359167,4.922676,0.2937193333333334,1.028548625,1.111591791,938086 -0.135.0,2024-09-24-16,frequency_index_stats_mode_auto,1.0824912083333333,0.04314030729787649,1.096318625,4.949983333333333,0.30059966666666665,1.0341325,1.1170225,924214 -0.135.0,2024-09-24-16,frequency_index_stats_mode_force,1.0689637633333333,0.03210656709556349,1.072836666,4.910664999999999,0.2876026666666666,1.035096416,1.098958208,935454 -0.135.0,2024-09-24-16,frequency_index_stats_mode_none,1.204983806,0.043028164990857536,1.224189917,5.400438333333334,0.30676699999999996,1.155697417,1.235064084,829876 -0.135.0,2024-09-24-16,frequency_j1,3.8380692913333334,0.06275029208618856,3.804467708,3.744283333333333,0.08838299999999999,3.7992751250000003,3.910465041,260552 -0.135.0,2024-09-24-16,frequency_j1_ignorecase,4.422148597666666,0.12451558138431715,4.462734792,4.328220666666666,0.08834666666666664,4.282403834,4.521307167,226142 -0.135.0,2024-09-24-16,frequency_limit20,3.133674013666667,0.03161891719242739,3.127625458,4.078483666666667,0.15589699999999998,3.105516291,3.167880292,319081 -0.135.0,2024-09-24-16,frequency_limit20_index,1.0761722363333333,0.0695473478197952,1.080959458,4.951464666666666,0.284047,1.004354959,1.143202292,929368 -0.135.0,2024-09-24-16,frequency_no_limit,5.060725708333334,0.01524724816893902,5.064347375,5.984163,0.17553699999999997,5.043993708,5.073836042,197589 -0.135.0,2024-09-24-16,frequency_no_limit_index,2.736244889,0.09661334239009267,2.708049958,6.638563333333333,0.322608,2.6568655,2.8438192090000003,365497 -0.135.0,2024-09-24-16,frequency_notrim,2.655396902666667,0.046896941701464384,2.634113,3.581782666666667,0.14866266666666667,2.622916125,2.709161583,376648 
-0.135.0,2024-09-24-16,frequency_notrim_index,1.012327097,0.013312405203440444,1.015763583,4.405483333333333,0.30316533333333334,0.997633375,1.023584333,988142 -0.135.0,2024-09-24-16,frequency_other_sorted,3.2561983753333337,0.15079996725149042,3.21875275,4.225429666666667,0.15031599999999998,3.127649334,3.422193042,307125 -0.135.0,2024-09-24-16,frequency_other_sorted_index,1.0926134583333333,0.03106780889609415,1.109147292,5.041124666666666,0.296475,1.056775125,1.111917958,914913 -0.135.0,2024-09-24-16,frequency_selregex,0.8351828470000001,0.008723118527202067,0.838418791,0.9158883333333333,0.057894,0.8253041670000001,0.8418255830000001,1197605 -0.135.0,2024-09-24-16,frequency_selregex_ignorecase,1.0209325276666668,0.015728629606828744,1.019474958,1.0941743333333331,0.059975999999999995,1.005983417,1.037339208,979432 -0.135.0,2024-09-24-16,frequency_sorted,2.9222854586666664,0.06648564345737881,2.903838625,3.7744139999999997,0.15414333333333333,2.8669710840000002,2.996046667,342231 -0.135.0,2024-09-24-16,frequency_sorted_index,3.009924208333333,0.10678435574180659,2.99290325,3.8734683333333333,0.1593083333333333,2.912672625,3.12419675,332226 -0.135.0,2024-09-24-16,geocode_reverse,3.6880939306666662,0.00863975145930737,3.684070041,2.9772176666666663,10.374819,3.682200084,3.6980116670000003,271150 -0.135.0,2024-09-24-16,geocode_reverse_batchall,3.897372819333333,0.027767561244194317,3.909226625,3.3624283333333334,10.494807333333334,3.865645666,3.917246167,256608 -0.135.0,2024-09-24-16,geocode_suggest,4.592817819666666,0.004441685384265679,4.592079792,21.658865000000002,2.560424,4.588791375,4.597582292,217723 -0.135.0,2024-09-24-16,geocode_suggest_batchall,4.572229444333334,0.028855180910389664,4.586816,22.194153333333333,1.4211766666666668,4.538993041,4.590879292,218723 -0.135.0,2024-09-24-16,index,0.4797199863333333,0.0025052446830691674,0.480966,0.4356623333333333,0.04165833333333333,0.476836042,0.481357917,2083333 
-0.135.0,2024-09-24-16,input,1.6647838053333333,0.0028998328398934865,1.665573958,1.6172893333333331,0.04491233333333333,1.661570791,1.667206667,600601 -0.135.0,2024-09-24-16,join,2.0172392776666666,0.015012515482451774,2.016704666,1.3798323333333335,0.6335263333333333,2.002501209,2.032511958,495786 -0.135.0,2024-09-24-16,join_casei,2.033957111,0.030735016833308194,2.024246667,1.4049996666666666,0.6254603333333333,2.009250166,2.0683745,491642 -0.135.0,2024-09-24-16,joinp,0.04221793033333334,0.0004218237739819001,0.042042208000000005,0.12950899999999999,0.020065333333333334,0.041912375,0.042699208,23809524 -0.135.0,2024-09-24-16,joinp_streaming,0.02875315266666667,0.00034883999598144314,0.028889625000000002,0.04144833333333333,0.009215666666666665,0.028356708,0.029013125,34482759 -0.135.0,2024-09-24-16,json,22.288082791333334,0.6040361235228544,22.144090916,17.532103,3.8630259999999996,21.769054708,22.95110275,44867 -0.135.0,2024-09-24-16,jsonl,1.4996724026666666,0.02129130394532689,1.51019975,7.031866,0.20919166666666666,1.4751681250000002,1.513649333,666667 -0.135.0,2024-09-24-16,jsonl_batchall,0.0065708196666666675,0.00003340283739943842,0.006589917000000001,0.0042313333333333335,0.0014026666666666666,0.006532250000000001,0.006590292,142857143 -0.135.0,2024-09-24-16,jsonl_j1,5.92287075,0.020504258228094523,5.913642625,5.791743666666666,0.12608533333333333,5.908602125,5.9463675,168833 -0.135.0,2024-09-24-16,luau_filter,7.528790555333334,0.06770994502347814,7.505211792,6.999822333333333,0.5246979999999999,7.476022458,7.605137416,132820 -0.135.0,2024-09-24-16,luau_filter_colidx,9.125884777666668,0.020188361167090923,9.119456125,8.59906,0.522716,9.109693583,9.148504625,109577 -0.135.0,2024-09-24-16,luau_filter_no_globals,4.8728402363333325,0.003538084860333533,4.874713417,4.357304666666666,0.5123763333333332,4.868759375,4.875047917,205212 
-0.135.0,2024-09-24-16,luau_filter_no_globals_colidx,6.480947819333333,0.08335385896378719,6.43424475,5.955324333333333,0.5217593333333332,6.4314160000000005,6.5771827080000005,154297 -0.135.0,2024-09-24-16,luau_filter_no_globals_colidx,6.416223027666667,0.04007971906774632,6.434327875,5.896752666666667,0.515821,6.370285041,6.444056167,155860 -0.135.0,2024-09-24-16,luau_filter_no_globals_no_colidx,4.882403042,0.008304400477421212,4.878792625,4.366066999999999,0.5129766666666667,4.876514959,4.891901542,204834 -0.135.0,2024-09-24-16,luau_multi,23.563149166666665,0.12084128658263342,23.618674792,22.948927999999995,0.6057076666666666,23.42452475,23.646247958,42439 -0.135.0,2024-09-24-16,luau_multi_colidx,25.307053138666664,0.14190246076738108,25.229437541,24.68983366666667,0.6084596666666666,25.22088825,25.470833625,39515 -0.135.0,2024-09-24-16,luau_multi_no_globals,20.789784833,0.09330818084710885,20.771233208,20.18826333333333,0.5941096666666666,20.707146041,20.89097525,48100 -0.135.0,2024-09-24-16,luau_multi_no_globals_colidx,22.468777777666663,0.06336147206336232,22.434319,21.860624666666666,0.5994426666666667,22.430113333,22.541901,44506 -0.135.0,2024-09-24-16,luau_script,34.37493123633333,0.18424511590436754,34.450772584,33.71988733333333,0.6386066666666667,34.164870125,34.509151,29091 -0.135.0,2024-09-24-16,luau_script_colidx,35.95311194433334,0.09459903982334486,35.9000125,35.315021333333334,0.6261333333333333,35.89699175,36.062331583,27814 -0.135.0,2024-09-24-16,luau_script_no_globals,31.48654755533333,0.06536227423210579,31.483836875,30.858859666666664,0.6170133333333333,31.422582791,31.553223,31759 -0.135.0,2024-09-24-16,luau_script_no_globals_colidx,32.908628347333334,0.11577043469572552,32.853631709,32.300245,0.5985086666666667,32.830607375,33.041645958,30387 -0.135.0,2024-09-24-16,partition,2.6910777216666664,0.08134254580448368,2.683710458,1.0471736666666664,1.5704663333333333,2.613669416,2.775853291,371609 
-0.135.0,2024-09-24-16,pseudo,1.749868542,0.014225692245463405,1.742604959,1.6951476666666665,0.051841,1.740741,1.766259667,571429 -0.135.0,2024-09-24-16,pseudo_formatstr,1.9699362636666669,0.01933357860842604,1.974440125,1.9101553333333332,0.056452,1.948748291,1.9866203750000002,507614 -0.135.0,2024-09-24-16,rename,1.0223717223333335,0.00132853788634319,1.022689459,0.9770796666666666,0.043116999999999996,1.020913125,1.023512583,978474 -0.135.0,2024-09-24-16,replace,2.2208249166666665,0.01846993240089755,2.227925875,2.173591,0.044233999999999996,2.199858333,2.234690542,450248 -0.135.0,2024-09-24-16,reverse,1.0730346803333333,0.0092353364340152,1.076827291,0.9547929999999999,0.11276966666666666,1.062506833,1.079769917,931966 -0.135.0,2024-09-24-16,reverse_index,6.212611430666667,0.020249710145143034,6.221831625,1.3274023333333334,4.874867333333333,6.189392417,6.22661025,160953 -0.135.0,2024-09-24-16,safenames,1.0288320416666668,0.007301467464161432,1.027015625,0.9827119999999999,0.04351766666666667,1.02261025,1.03687025,971817 -0.135.0,2024-09-24-16,sample_10,0.5373245556666667,0.006146533517872017,0.539554208,0.49454533333333334,0.039980666666666664,0.5303743750000001,0.5420450840000001,1862197 -0.135.0,2024-09-24-16,sample_1000,0.5382474306666668,0.007019726693921357,0.540284666,0.4948526666666666,0.04031033333333333,0.530434417,0.544023209,1858736 -0.135.0,2024-09-24-16,sample_100000,0.6926311803333333,0.017393950129422162,0.6905185420000001,0.6419503333333333,0.047856,0.676390041,0.7109849580000001,1443001 -0.135.0,2024-09-24-16,sample_100000_index,1.183129625,0.027482343506770583,1.170633,0.18858733333333333,0.9905156666666667,1.164116208,1.214639667,845309 -0.135.0,2024-09-24-16,sample_100000_seeded,0.6976854580000001,0.007627286306528944,0.6934967080000001,0.647064,0.04746999999999999,0.6930704160000001,0.7064892500000001,1432665 
-0.135.0,2024-09-24-16,sample_100000_seeded_faster,0.7007501946666667,0.009656839819833469,0.703876542,0.6490819999999999,0.048329000000000004,0.6899175000000001,0.7084565420000001,1426534 -0.135.0,2024-09-24-16,sample_100000_seeded_index,1.1815479446666668,0.021342830279886673,1.187367542,0.18883399999999997,0.9894916666666665,1.157898917,1.199377375,846024 -0.135.0,2024-09-24-16,sample_100000_seeded_index_faster,1.1718582366666668,0.026610947115020527,1.159705834,0.18258200000000002,0.9856303333333333,1.153493042,1.202375834,853242 -0.135.0,2024-09-24-16,sample_100000_seeded_index_secure,1.1762253753333334,0.034821122653850185,1.162576167,0.18486533333333333,0.9872706666666665,1.150296584,1.2158033750000001,850340 -0.135.0,2024-09-24-16,sample_100000_seeded_secure,0.7022781386666668,0.002806519520520091,0.7018941660000001,0.6512106666666667,0.04800533333333334,0.699683375,0.705256875,1424501 -0.135.0,2024-09-24-16,sample_1000_index,0.03284229166666667,0.0009022042103234371,0.032380417,0.017162666666666663,0.013244,0.032264541,0.033881917000000004,30303030 -0.135.0,2024-09-24-16,sample_10_index,0.020079805333333332,0.0026556984030789974,0.018650541,0.015397333333333332,0.0025099999999999996,0.018444834,0.023144041,50000000 -0.135.0,2024-09-24-16,sample_25pct_index,2.9750148053333336,0.039682715233512765,2.9886605410000002,0.4587766666666666,2.5108603333333335,2.9303097080000002,3.006074167,336134 -0.135.0,2024-09-24-16,sample_25pct_seeded_index,2.929300958666667,0.016667684856128653,2.926092292,0.45078733333333326,2.474131666666666,2.914470875,2.947339709,341413 -0.135.0,2024-09-24-16,schema,8.195313541666666,0.0446462203828494,8.219507125,13.751384333333332,0.3872166666666666,8.143792292,8.222641208,122026 -0.135.0,2024-09-24-16,schema_index,0.12013554166666668,0.008694214500357247,0.12133300000000001,1.0265826666666664,0.06283733333333333,0.11090466700000001,0.128168958,8333333 
-0.135.0,2024-09-24-16,search,0.6265067916666668,0.007650147927947007,0.6269241670000001,0.5816423333333334,0.04228166666666666,0.6186565,0.633939708,1594896 -0.135.0,2024-09-24-16,search_file,1.1424745836666668,0.004410180972715056,1.140925209,1.0975366666666666,0.04236499999999999,1.139048167,1.147450375,875657 -0.135.0,2024-09-24-16,search_file_case_sensitive,1.0551354586666666,0.013380968172748135,1.061939292,1.0095669999999999,0.042846666666666665,1.03971975,1.063747334,947867 -0.135.0,2024-09-24-16,search_file_case_sensitive_unicode,1.0535573473333333,0.0025878591646038974,1.054604084,1.008513,0.042692666666666663,1.050610083,1.055457875,948767 -0.135.0,2024-09-24-16,search_file_flag,1.352446694666667,0.006866526999651949,1.350410292,1.3065786666666666,0.043194333333333335,1.346828708,1.360101084,739645 -0.135.0,2024-09-24-16,search_file_flag_matchonly,0.865085611,0.001606240711946087,0.865100708,0.8207903333333334,0.04177466666666666,0.863471875,0.86668425,1156069 -0.135.0,2024-09-24-16,search_file_literal,0.971270611,0.007475221393073435,0.9687631250000001,0.9271039999999999,0.041806333333333334,0.9653715,0.979677208,1029866 -0.135.0,2024-09-24-16,search_file_unicode,1.1639032083333334,0.0028271832971763643,1.164281625,1.1180846666666664,0.043166333333333334,1.160905875,1.166522125,859107 -0.135.0,2024-09-24-16,search_unicode,0.6254133476666667,0.006438321057042762,0.6266624590000001,0.5813256666666666,0.041565,0.618442,0.631135584,1600000 -0.135.0,2024-09-24-16,searchset,1.2975941806666667,0.005470147761068308,1.299083,1.6017850000000002,0.09631633333333334,1.2915337500000001,1.302165792,770416 -0.135.0,2024-09-24-16,searchset_ignorecase,1.556388874666667,0.009854970109784755,1.553036041,1.8519333333333332,0.09590833333333333,1.548647791,1.567482792,642674 -0.135.0,2024-09-24-16,searchset_unicode,1.296759014,0.007013633688767448,1.298942917,1.600164666666667,0.09496533333333333,1.28891325,1.3024208750000001,771010 
-0.135.0,2024-09-24-16,select,0.5093054026666667,0.00404490462352935,0.5078590000000001,0.46620966666666663,0.040752666666666666,0.506182542,0.513874666,1964637 -0.135.0,2024-09-24-16,select_regex,0.548204375,0.003994134017865046,0.546756208,0.504966,0.04085166666666667,0.5451363330000001,0.552720584,1824818 -0.135.0,2024-09-24-16,slice_last_1k,0.010606583000000001,0.000169965785201022,0.010644333,0.006411666666666666,0.0022793333333333333,0.010420916,0.0107545,90909091 -0.135.0,2024-09-24-16,slice_last_1k_index,0.007609944666666667,0.000055747023008707014,0.007592958,0.005165,0.0015609999999999999,0.007564667000000001,0.007672209,125000000 -0.135.0,2024-09-24-16,slice_last_1k_json,0.012858736333333334,0.00032299375177444715,0.012906875,0.008939,0.002187,0.012514375000000001,0.013154959,76923077 -0.135.0,2024-09-24-16,slice_last_1k_json_index,0.010644638666666666,0.00015579397958308104,0.010569208,0.008019,0.0017686666666666667,0.010540917,0.010823791000000001,90909091 -0.135.0,2024-09-24-16,slice_one_middle,0.26543452800000006,0.003765382528736338,0.26451687500000004,0.24240466666666668,0.020758333333333333,0.26221279200000003,0.269573917,3773585 -0.135.0,2024-09-24-16,slice_one_middle_index,0.008055680333333334,0.0003542027264848952,0.008111083,0.004650333333333334,0.0020806666666666664,0.007677041000000001,0.008378917000000001,125000000 -0.135.0,2024-09-24-16,snappy_compress,0.1642554856666667,0.011889780817104937,0.16231683300000002,1.184044,0.111804,0.153454166,0.17699545800000002,6097561 -0.135.0,2024-09-24-16,snappy_decompress,0.5102681533333334,0.005500153104407293,0.5076378340000001,0.48904499999999995,0.018954333333333333,0.506577042,0.516589584,1960784 -0.135.0,2024-09-24-16,snappy_validate,0.4934080836666667,0.0025609641932206524,0.49444341700000005,0.47320233333333334,0.017969333333333334,0.49049154200000006,0.49528929200000005,2028398 
-0.135.0,2024-09-24-16,sort,1.362722180666667,0.015224198255254588,1.367705125,2.2367333333333335,0.13478166666666666,1.345630917,1.3748305,733676 -0.135.0,2024-09-24-16,sort_random_seeded,1.277235667,0.0220482709217895,1.274202042,1.152621,0.11864433333333334,1.256861292,1.3006436670000001,783085 -0.135.0,2024-09-24-16,sort_random_seeded_faster,1.2766626113333333,0.013032407405937125,1.275930084,1.1576186666666666,0.11349999999999999,1.264011917,1.290045833,783085 -0.135.0,2024-09-24-16,sort_random_seeded_secure,1.2543891943333332,0.026713117134247937,1.244067208,1.1390199999999997,0.110062,1.234377125,1.28472325,797448 -0.135.0,2024-09-24-16,sortcheck_sorted,0.5153332220000001,0.006185081540862334,0.514269416,0.4728136666666667,0.04011833333333333,0.509749042,0.521981208,1941748 -0.135.0,2024-09-24-16,sortcheck_unsorted,0.008389638666666666,0.0005737244630886276,0.008307833,0.004738333333333333,0.0019296666666666666,0.007861208,0.008999875000000001,125000000 -0.135.0,2024-09-24-16,sortcheck_unsorted_all,0.5501707916666668,0.005541494101793171,0.550404,0.5066083333333333,0.04098733333333333,0.544516375,0.5555920000000001,1818182 -0.135.0,2024-09-24-16,split,1.0493704996666666,0.007910423553617426,1.049418791,0.9026493333333333,0.115753,1.041436041,1.057256667,953289 -0.135.0,2024-09-24-16,split_chunks,1.120234292,0.030111930644881258,1.104892959,1.3082676666666666,0.16497799999999999,1.100882542,1.154927375,892857 -0.135.0,2024-09-24-16,split_chunks_index,0.18697277766666667,0.042858622505422356,0.17491662500000002,1.0691399999999998,0.20385699999999995,0.15143345800000002,0.23456825,5347594 -0.135.0,2024-09-24-16,split_chunks_index_j1,0.007465305666666667,0.0000796514095690299,0.00742775,0.004699333333333332,0.003094333333333333,0.007411375,0.0075567920000000005,142857143 -0.135.0,2024-09-24-16,split_index,0.23768683333333337,0.0725559356751515,0.27324037500000004,1.091119,0.199955,0.154211333,0.285608792,4201681 
-0.135.0,2024-09-24-16,split_index_j1,1.2592708053333332,0.16255380151945975,1.183772458,0.9546813333333333,0.142186,1.148195458,1.4458445,794281 -0.135.0,2024-09-24-16,split_kbsize,2.2766981803333333,0.012927974824239937,2.269877791,2.1327936666666667,0.12341999999999999,2.268608625,2.2916081249999998,439174 -0.135.0,2024-09-24-16,sqlp,0.40779734700000003,0.007914565962686339,0.4053975,1.695825,0.174725,0.40136045800000003,0.41663408300000004,2450980 -0.135.0,2024-09-24-16,sqlp_aggregations,0.29816933333333334,0.009115512732610808,0.298473416,0.6971173333333333,0.07068100000000001,0.288905584,0.30712900000000004,3355705 -0.135.0,2024-09-24-16,sqlp_aggregations_expensive,0.16935347200000003,0.003527373938776974,0.16779433300000002,0.7539283333333332,0.08640199999999999,0.166874333,0.17339175,5917160 -0.135.0,2024-09-24-16,sqlp_aggregations_expensive_streaming,0.16805984733333335,0.004125716956913097,0.168813292,0.7404026666666667,0.08692166666666667,0.16360933300000002,0.171756917,5952381 -0.135.0,2024-09-24-16,sqlp_aggregations_expensive_vs_duckdb,0.220192611,0.006895560778436802,0.221206791,0.9563296666666665,0.11642399999999999,0.21284612500000002,0.22652491700000002,4545455 -0.135.0,2024-09-24-16,sqlp_aggregations_vs_duckdb,0.196525528,0.0035376552200915585,0.195820333,0.5983583333333332,0.10455166666666667,0.193393584,0.20036266700000002,5076142 -0.135.0,2024-09-24-16,sqlp_format_arrow,0.42199330533333335,0.004556207572483976,0.424045875,1.7110273333333332,0.17549099999999998,0.416771875,0.425162166,2369668 -0.135.0,2024-09-24-16,sqlp_format_avro,0.4102144026666667,0.005061229174795013,0.411516125,1.6861026666666665,0.166091,0.40462945800000005,0.41449762500000004,2439024 -0.135.0,2024-09-24-16,sqlp_format_json,0.4133611386666667,0.0030919226421735322,0.41396975,1.6944289999999997,0.166755,0.410010166,0.4161035,2421308 
-0.135.0,2024-09-24-16,sqlp_format_jsonl,0.4222604446666667,0.0051319637476978965,0.42457575000000003,1.6993219999999998,0.17043533333333336,0.41637875,0.42582683400000004,2369668 -0.135.0,2024-09-24-16,sqlp_format_parquet,0.4298057916666667,0.006353062689149565,0.42835225000000005,1.7149919999999998,0.18642499999999998,0.42430545900000005,0.43675966600000005,2325581 -0.135.0,2024-09-24-16,sqlp_format_parquet_statistics,0.43114048600000004,0.007458680589017788,0.428229917,1.7372453333333333,0.18690700000000002,0.42557591600000005,0.439615625,2320186 -0.135.0,2024-09-24-16,sqlp_lowmemory,0.417616889,0.014694293519993164,0.418098417,1.7124036666666667,0.18015599999999998,0.40268775,0.4320645,2392344 -0.135.0,2024-09-24-16,sqlp_nooptimizations,0.6870286663333335,0.025997633800272996,0.677023083,1.9372059999999998,0.24553666666666665,0.667520375,0.716542541,1455604 -0.135.0,2024-09-24-16,sqlp_tryparsedates,11.698556333333334,0.06224271897816058,11.698906792,12.959810666666664,0.19394,11.636139125,11.760623083,85477 -0.135.0,2024-09-24-16,sqlp_tryparsedates_inferlen,1.3534028196666668,0.012706432993371718,1.3581646250000001,2.6290549999999997,0.19035133333333332,1.3390032920000001,1.363040542,739098 -0.135.0,2024-09-24-16,stats,1.9629378336666667,0.010769082619319158,1.967885834,2.2633656666666666,0.09611233333333331,1.950584,1.970343667,509424 -0.135.0,2024-09-24-16,stats_create_cache,1.9676440283333336,0.014059063612788077,1.968494667,2.2679753333333337,0.09736033333333334,1.9531789590000002,1.9812584590000002,508130 -0.135.0,2024-09-24-16,stats_everything,3.4835596803333337,0.022140485521481172,3.489869833,10.353195999999999,0.418762,3.458949125,3.501860083,287026 -0.135.0,2024-09-24-16,stats_everything_create_cache,3.483698680333333,0.02350966111101831,3.483431333,10.355561333333332,0.40347399999999994,3.460323833,3.507340875,287026 
-0.135.0,2024-09-24-16,stats_everything_index,1.3361272086666667,0.010267401393492725,1.336258584,10.954704333333332,0.599267,1.32579475,1.346328292,748503 -0.135.0,2024-09-24-16,stats_everything_index_j1,6.854695902666666,0.2124807669045681,6.785950959,6.773230666666667,0.22746233333333332,6.685098583,7.093038166,145879 -0.135.0,2024-09-24-16,stats_everything_index_j1_with_cache,0.006949500333333334,0.0001236906413207295,0.006941584000000001,0.004315333333333333,0.0016300000000000002,0.0068299580000000006,0.007076959000000001,142857143 -0.135.0,2024-09-24-16,stats_everything_index_with_cache,0.006755764,0.000029931225183744385,0.0067475,0.0042829999999999995,0.0015563333333333332,0.006730833,0.006788959000000001,142857143 -0.135.0,2024-09-24-16,stats_everything_infer_dates,7.544474764,0.06610571676363132,7.539719792,14.65186433333333,0.43718199999999996,7.480874917,7.612829583,132556 -0.135.0,2024-09-24-16,stats_everything_infer_dates_index,1.9712553890000002,0.01578180308960967,1.974954,17.493133666666665,0.6190006666666666,1.95395275,1.984859417,507357 -0.135.0,2024-09-24-16,stats_everything_infer_dates_index_with_cache,0.009453278,0.0003059174776242768,0.009464208,0.005007333333333333,0.0025029999999999996,0.009142042000000001,0.009753584000000001,111111111 -0.135.0,2024-09-24-16,stats_everything_j1,6.932226486333334,0.2191861854228798,6.9921535,7.092407666666666,0.301378,6.689309584,7.115216375,144259 -0.135.0,2024-09-24-16,stats_everything_sorted,3.4060757083333333,0.019469531742491518,3.399919291,9.889667333333334,0.41274000000000005,3.390428625,3.427879209,293600 -0.135.0,2024-09-24-16,stats_everything_sorted_index,3.360670514,0.012779792428819386,3.366128125,9.836019333333335,0.39137433333333327,3.346068042,3.369815375,297530 -0.135.0,2024-09-24-16,stats_index,0.22936890266666668,0.0032174864896102368,0.227900125,2.309188,0.062957,0.227147917,0.23305866600000003,4366812 
-0.135.0,2024-09-24-16,stats_index_j1,1.9043955553333334,0.002462638986340865,1.905520583,1.861112333333333,0.040918666666666666,1.9015713330000001,1.90609475,525210 -0.135.0,2024-09-24-16,stats_index_j1_with_cache,0.008856083333333334,0.00018000264550648503,0.00888825,0.0048,0.0022749999999999997,0.008662166,0.009017834,111111111 -0.135.0,2024-09-24-16,stats_index_with_cache,0.006949653,0.000141000668306927,0.006932875000000001,0.004216333333333333,0.0017366666666666665,0.0068177920000000005,0.007098292000000001,142857143 -0.135.0,2024-09-24-16,table,7.138138583333333,0.09404506235263703,7.18907575,5.9659043333333335,1.157986,7.029612625,7.195727375,140095 -0.135.0,2024-09-24-16,to_datapackage,0.7272858193333335,0.008316194758123806,0.7270111250000001,2.0777063333333334,0.2936213333333333,0.7191103750000001,0.735735958,1375516 -0.135.0,2024-09-24-16,to_sqlite,0.7271016666666666,0.01313184003378861,0.723635584,2.0855226666666664,0.2876823333333333,0.7160505410000001,0.741618875,1375516 -0.135.0,2024-09-24-16,to_xlsx,33.010126431,0.4606242418609583,32.940378042,30.806671333333338,3.463896333333333,32.588354084,33.501647167,30294 -0.135.0,2024-09-24-16,tojsonl,5.775423750000001,0.18043726685672368,5.7205741670000005,10.299069000000001,0.3912633333333333,5.628776,5.976921083,173160 -0.135.0,2024-09-24-16,tojsonl_batchall,5.769866222333334,0.04195440338697841,5.788551958,10.241785666666667,0.7365626666666666,5.721815417,5.799231292,173310 -0.135.0,2024-09-24-16,tojsonl_index,2.7748353886666663,0.1548881581347862,2.750104541,11.273431666666667,0.4748323333333333,2.633800583,2.940601042,360360 -0.135.0,2024-09-24-16,tojsonl_index_j1,8.915929097333333,0.04805372391867694,8.901217,8.700922666666665,0.20682966666666666,8.876951292,8.969619,112158 -0.135.0,2024-09-24-16,tojsonl_j1,8.904643583,0.21566559037967215,8.905772333,9.051070666666666,0.24466033333333334,8.688415833,9.119742583,112296 
-0.135.0,2024-09-24-16,tojsonl_trim,5.712293930666667,0.17084569721042805,5.783862167,11.277515,0.3860696666666667,5.517303208,5.835716417,175070 -0.135.0,2024-09-24-16,tojsonl_trim_j1,9.752734736333334,0.0990284076154165,9.7563365,9.877368333333333,0.2611596666666667,9.651954584,9.849913125,102533 -0.135.0,2024-09-24-16,transpose,2.7220254166666664,0.004692695556098049,2.723108375,2.498336333333333,0.218445,2.716885917,2.726081958,367377 -0.135.0,2024-09-24-16,transpose_multipass,22.24792134733333,0.07624247920462365,22.22850475,20.51791133333333,1.7154086666666666,22.183264584,22.331994708,44948 -0.135.0,2024-09-24-16,validate,1.30840425,0.018368396880890668,1.297992333,5.896503,0.12280266666666667,1.297607334,1.329613083,764526 -0.135.0,2024-09-24-16,validate_batchall,1.4328333333333336,0.013285487941556798,1.4311021670000001,6.364915,0.2067393333333333,1.420498292,1.446899541,697837 -0.135.0,2024-09-24-16,validate_batchall_index,1.3639913746666668,0.008800722736897913,1.3681779170000001,5.896708666666666,0.14448799999999998,1.353878916,1.3699172910000001,733138 -0.135.0,2024-09-24-16,validate_dynenum,1.3199649583333333,0.01111617185067419,1.318487958,5.849461666666667,0.12200733333333331,1.309661125,1.331745792,757576 -0.135.0,2024-09-24-16,validate_dynenum_batchall,1.434016153,0.023234078165057275,1.4231925840000001,6.246928333333333,0.20278133333333334,1.418168583,1.460687292,697350 -0.135.0,2024-09-24-16,validate_dynenum_batchall_index,1.4026368336666668,0.030722734813760044,1.416122083,5.8318303333333334,0.15353833333333333,1.367477709,1.424310709,712758 -0.135.0,2024-09-24-16,validate_dynenum_index,1.321816389,0.010077301850209695,1.322062834,5.814260999999999,0.12134333333333332,1.311618125,1.331768208,756430 -0.135.0,2024-09-24-16,validate_dynenum_no_schema,1.3120186526666666,0.01575150808263098,1.320337958,5.820349,0.12308599999999999,1.293851792,1.321866208,762195 
-0.135.0,2024-09-24-16,validate_dynenum_no_schema_index,1.3247700556666666,0.0077531056202698664,1.321226083,5.848779333333333,0.12089299999999999,1.319422292,1.333661792,754717 -0.135.0,2024-09-24-16,validate_dynenum_valid_output,2.259092778,0.008015024867107645,2.26258425,6.730052999999999,0.16668166666666664,2.24992425,2.264769834,442674 -0.135.0,2024-09-24-16,validate_dynenum_valid_output_index,2.2693469443333334,0.006661843234879,2.267653208,6.716018666666666,0.16425366666666666,2.263695459,2.276692166,440723 -0.135.0,2024-09-24-16,validate_index,1.3136580136666665,0.019416467389326893,1.304472958,5.853186333333333,0.11544333333333333,1.30053825,1.335962833,761035 -0.135.0,2024-09-24-16,validate_no_schema,0.5053290003333334,0.005870027604212966,0.503334042,0.4628716666666666,0.040303,0.5007164590000001,0.5119365,1980198 -0.135.0,2024-09-24-16,validate_no_schema_index,0.506884445,0.003765036220305587,0.5082592920000001,0.4649016666666666,0.039390666666666664,0.5026252090000001,0.5097688340000001,1972387 -0.135.0,2024-09-24-16,validate_valid_output,2.300180403,0.041454423239157906,2.281425958,6.857328,0.16683633333333334,2.2714174590000002,2.347697792,434783 -0.135.0,2024-09-24-16,validate_valid_output_index,2.2879162083333333,0.0226879417720758,2.297488083,6.789280333333333,0.16257466666666665,2.262010917,2.304249625,437063 +0.136.0,2024-10-09-22,apply_calcconv,1.7050352779999998,0.019588303727766737,1.706148542,3.551023666666667,0.20906266666666665,1.684914083,1.724043209,586510 +0.136.0,2024-10-09-22,apply_dynfmt,1.9835509580000001,0.01520292475588627,1.991420625,6.893937666666666,0.23447933333333335,1.966026416,1.9932058330000002,504032 +0.136.0,2024-10-09-22,apply_emptyreplace,1.5998238196666668,0.015927380642159632,1.592549459,2.684836333333333,0.22189199999999998,1.588832459,1.618089541,625000 +0.136.0,2024-10-09-22,apply_op_eudex,1.5777951106666668,0.007463856328587575,1.578263333,2.1339263333333336,0.25768166666666664,1.570108166,1.585013833,633714 
+0.136.0,2024-10-09-22,apply_op_sentiment,3.9648692776666663,0.38017112229641026,3.760939625,24.910353999999998,0.294889,3.730174625,4.403493583,252207 +0.136.0,2024-10-09-22,apply_op_similarity,1.6037357640000003,0.011346707375146987,1.600556,2.227638333333333,0.2090173333333333,1.594318167,1.6163331250000001,623441 +0.136.0,2024-10-09-22,apply_op_similarity_batchall,1.4988207913333333,0.018303877114343062,1.492236125,1.7975236666666665,0.173787,1.4847202080000002,1.519506041,667111 +0.136.0,2024-10-09-22,apply_op_string,1.5737780416666667,0.016098774120262248,1.568187583,2.680716,0.19417966666666664,1.56121975,1.5919267920000002,635324 +0.136.0,2024-10-09-22,behead,0.8816215836666667,0.0029518552016063676,0.880686084,0.8362063333333333,0.04412866666666667,0.879250834,0.8849278330000001,1133787 +0.136.0,2024-10-09-22,behead_flexible,0.8807327083333334,0.0007861678845566681,0.8803495,0.8331896666666667,0.04612466666666667,0.880211625,0.881637,1135074 +0.136.0,2024-10-09-22,cat_columns,2.844363097,0.004815946161097188,2.846609875,2.747449,0.09545666666666665,2.8388343330000003,2.847645083,351617 +0.136.0,2024-10-09-22,cat_rows,1.733864903,0.01796410766456159,1.739907917,1.6407429999999998,0.09199366666666668,1.7136585000000002,1.748028292,576701 +0.136.0,2024-10-09-22,cat_rows_flexible,1.7142814723333337,0.005350188438718356,1.7144205000000001,1.6214606666666667,0.09174066666666665,1.7088631250000001,1.719560792,583431 +0.136.0,2024-10-09-22,cat_rowskey,3.1547543889999994,0.025507955662027824,3.144587958,3.05395,0.09971533333333334,3.135897292,3.183777917,316957 +0.136.0,2024-10-09-22,count,0.07404358300000001,0.0033655614659618086,0.074128917,0.4266313333333333,0.052357999999999995,0.070636166,0.077365666,13513514 +0.136.0,2024-10-09-22,count_flexible,0.462141944,0.0017590328841335147,0.462686458,0.417162,0.04387433333333333,0.460175041,0.46356433300000005,2164502 
+0.136.0,2024-10-09-22,count_index,0.007620777666666666,0.000044225044650439403,0.007597292,0.0047279999999999996,0.0022673333333333334,0.00759325,0.007671791000000001,125000000 +0.136.0,2024-10-09-22,count_no_polars,0.46205180533333334,0.003785031735211785,0.460538875,0.4169433333333334,0.044038,0.45925725,0.466359291,2164502 +0.136.0,2024-10-09-22,count_polars_lowmem,0.057583291666666675,0.004786411221385849,0.056398750000000004,0.42018833333333333,0.053239,0.053500375,0.06285075000000001,17241379 +0.136.0,2024-10-09-22,count_width,0.4694743473333334,0.0008699056995136294,0.46932366700000006,0.4387203333333333,0.051019333333333326,0.46868962500000005,0.47040975,2132196 +0.136.0,2024-10-09-22,count_width_index,0.47969713900000005,0.0014514370775266074,0.479283334,0.4478786666666667,0.052171666666666665,0.478497541,0.481310542,2083333 +0.136.0,2024-10-09-22,datefmt,1.774153125,0.0035590442597932476,1.774050667,4.817035999999999,0.18269099999999996,1.770646416,1.777762292,563698 +0.136.0,2024-10-09-22,datefmt_formatstr_newcol,1.696654791666667,0.0033475541032160095,1.6982641250000001,4.038033333333334,0.18209666666666666,1.6928065,1.69889375,589275 +0.136.0,2024-10-09-22,datefmt_multi,2.127302097333333,0.004210234557934205,2.129608458,8.931267,0.183364,2.122442625,2.129855209,470146 +0.136.0,2024-10-09-22,datefmt_multi_batchall,2.076182375,0.00452843165628292,2.0749605,8.506684666666667,0.16247333333333333,2.07239025,2.081196375,481696 +0.136.0,2024-10-09-22,datefmt_multi_select,2.3498884996666667,0.010352363040544177,2.349666833,11.608551999999998,0.19084,2.33964875,2.360349916,425532 +0.136.0,2024-10-09-22,dedup,1.2833504163333334,0.006031065668196462,1.285843375,2.0846046666666664,0.13363299999999997,1.276472541,1.287735333,779423 +0.136.0,2024-10-09-22,dedup_sorted,0.988590222,0.0025965061259993253,0.9875626660000001,0.9281836666666666,0.106696,0.9866647500000001,0.9915432500000001,1011122 
+0.136.0,2024-10-09-22,diff,2.644915194333333,0.02362920456233022,2.636728375,4.132400666666666,0.1896433333333333,2.626468167,2.671549041,378072 +0.136.0,2024-10-09-22,enum,0.8937754166666667,0.004067882760879962,0.891814875,0.845328,0.047180666666666655,0.891059084,0.898452291,1118568 +0.136.0,2024-10-09-22,enum_constant,0.8840505833333334,0.008684506612208931,0.8846767080000001,0.8348966666666665,0.048019,0.8750699590000001,0.8924050830000001,1131222 +0.136.0,2024-10-09-22,enum_copy,0.893037556,0.01224491038664033,0.890319084,0.8450743333333334,0.04683933333333334,0.882380334,0.9064132500000001,1119821 +0.136.0,2024-10-09-22,enum_hash,1.5074435973333333,0.0061144736970913036,1.5084482920000002,1.4567166666666667,0.04959233333333333,1.500889,1.5129934999999999,663570 +0.136.0,2024-10-09-22,enum_uuid,1.6966366806666666,0.0021752266734352942,1.697039709,0.9602846666666666,0.7351523333333333,1.694288125,1.698582208,589275 +0.136.0,2024-10-09-22,enum_uuid7,1.7300057640000002,0.002183014410172075,1.730594666,0.9914836666666665,0.7373496666666667,1.727588709,1.731833917,578035 +0.136.0,2024-10-09-22,excel,12.393773708333333,0.025221651975008946,12.379599833,12.269890333333331,0.921428,12.378827542,12.42289375,80684 +0.136.0,2024-10-09-22,excel_error_format_formula,21.884125819666668,0.00715533626895283,21.885303042,21.684978333333333,1.0060536666666666,21.876454875,21.890619542,45695 +0.136.0,2024-10-09-22,excel_j1,12.915160528000001,0.04663747179989207,12.901054709,12.124392666666667,0.7840539999999999,12.877204292,12.967222583,77429 +0.136.0,2024-10-09-22,excel_metadata,12.197152514,0.05778919332351246,12.222285042,11.175934333333332,1.0180623333333332,12.131052542,12.238119958,81987 +0.136.0,2024-10-09-22,excel_metadata_short,0.949276556,0.005821559078994813,0.9460756250000001,0.31018933333333326,0.6379016666666666,0.945757834,0.955996209,1053741 
+0.136.0,2024-10-09-22,excel_trim,12.549495555666667,0.03768325174515143,12.528584125,13.758439333333333,0.8868153333333333,12.526904917,12.592997625,79688 +0.136.0,2024-10-09-22,excel_trim_j1,14.098593306,0.030775290509148033,14.111557792,13.282066333333333,0.8124296666666666,14.063456917,14.120765209,70927 +0.136.0,2024-10-09-22,exclude,0.5592902643333334,0.000774786047916678,0.559157667,0.5167163333333332,0.041481333333333335,0.558590334,0.560122792,1788909 +0.136.0,2024-10-09-22,exclude_casei,0.5643613056666666,0.0031860825198504364,0.565165125,0.5201456666666666,0.04312633333333333,0.560850292,0.5670685000000001,1773050 +0.136.0,2024-10-09-22,exclude_casei_index,0.5783586943333333,0.003891602650041549,0.577528708,0.533566,0.043393666666666664,0.5749490420000001,0.5825983330000001,1730104 +0.136.0,2024-10-09-22,exclude_index,0.5735247776666667,0.000401239904309557,0.573679083,0.529978,0.04223166666666667,0.573069292,0.573825958,1742160 +0.136.0,2024-10-09-22,exclude_multi,0.8818913613333333,0.010612203719753337,0.877010917,0.8407443333333333,0.04008899999999999,0.874597375,0.894065792,1133787 +0.136.0,2024-10-09-22,exclude_multi_casei,0.889859625,0.007512512836473645,0.8877661250000001,0.849367,0.039474666666666665,0.883615916,0.898196834,1123596 +0.136.0,2024-10-09-22,exclude_multi_casei_index,0.9129257776666667,0.0040803320905515895,0.912865667,0.867269,0.04425866666666667,0.9088758330000001,0.917035833,1095290 +0.136.0,2024-10-09-22,exclude_multi_index,0.9042534306666669,0.0024943746574162032,0.9048569580000001,0.858998,0.043971333333333334,0.9015126670000001,0.906390667,1106195 +0.136.0,2024-10-09-22,explode,1.5868397643333332,0.012723681110287766,1.593969042,1.54249,0.043229333333333335,1.5721498340000002,1.5944004170000001,630120 +0.136.0,2024-10-09-22,extdedup,1.055031389,0.01862670088469005,1.04563,0.9730626666666665,0.07952733333333334,1.042979042,1.076485125,947867 
+0.136.0,2024-10-09-22,extsort,0.7629452496666668,0.01000641117909401,0.757770833,1.2839026666666664,0.2641253333333333,0.7565855410000001,0.7744793750000001,1310616 +0.136.0,2024-10-09-22,fill,2.0876410416666666,0.017161149403491567,2.096520375,2.0344270000000004,0.05202266666666666,2.0678595,2.09854325,478927 +0.136.0,2024-10-09-22,fixlengths,1.3367088053333334,0.0037192935909075913,1.336161416,1.2477593333333334,0.08782333333333332,1.333293541,1.340671459,747943 +0.136.0,2024-10-09-22,flatten,5.603500527666665,0.021892915561610844,5.603754292,5.533522333333333,0.06872466666666666,5.581481833,5.625265458,178444 +0.136.0,2024-10-09-22,flatten_condensed,5.987705097,0.014473185810507367,5.995395666,5.915737333333333,0.07072133333333332,5.971010125,5.9967095,167001 +0.136.0,2024-10-09-22,fmt,1.002943764,0.015636441586786093,1.011603417,0.9553783333333333,0.046455666666666666,0.9848932920000001,1.012334583,997009 +0.136.0,2024-10-09-22,fmt_no_crlf,0.9969341666666667,0.010619823384458773,0.9984205420000001,0.9479203333333333,0.047823,0.985649458,1.0067325,1003009 +0.136.0,2024-10-09-22,fmt_no_final_newline,1.0060015279999999,0.002756928572519994,1.007205584,0.9571203333333332,0.04771533333333333,1.002847375,1.007951625,994036 +0.136.0,2024-10-09-22,foreach,0.007608791333333334,0.00019723007370158691,0.0076508750000000006,0.0046619999999999995,0.002355,0.007393916,0.007781583000000001,125000000 +0.136.0,2024-10-09-22,frequency,2.8425398056666666,0.015544831875395617,2.847457958,3.763486333333333,0.15672966666666666,2.825130792,2.8550306670000003,351741 +0.136.0,2024-10-09-22,frequency_ignorecase,3.5406878470000005,0.01270137750083542,3.54438075,4.4127849999999995,0.15055033333333334,3.52654925,3.551133541,282406 +0.136.0,2024-10-09-22,frequency_ignorecase_index,1.1237090693333334,0.05675912448248234,1.151073708,5.658496333333333,0.26584800000000003,1.05845175,1.16160175,889680 
+0.136.0,2024-10-09-22,frequency_index,1.0480152363333333,0.05581236833496712,1.06061575,4.836763333333333,0.27762166666666666,0.9869797920000001,1.096450167,954198 +0.136.0,2024-10-09-22,frequency_index_stats_mode_auto,1.0070706113333332,0.008835466506475256,1.010171625,4.776612666666666,0.28615066666666666,0.997102667,1.013937542,993049 +0.136.0,2024-10-09-22,frequency_index_stats_mode_force,1.001722,0.017089296086828448,1.000352042,4.752778999999999,0.2881666666666666,0.985358916,1.019455042,998004 +0.136.0,2024-10-09-22,frequency_index_stats_mode_none,1.1178535139999999,0.008691833816215843,1.11994925,5.200261,0.2872523333333333,1.108305417,1.125305875,894454 +0.136.0,2024-10-09-22,frequency_j1,3.3876877083333334,0.029824394521667963,3.371393833,3.2935416666666666,0.09082966666666666,3.369559625,3.422109667,295159 +0.136.0,2024-10-09-22,frequency_j1_ignorecase,4.097180166666667,0.04444788013603125,4.074708084,3.998699333333333,0.09522966666666666,4.068455375,4.148377041,244081 +0.136.0,2024-10-09-22,frequency_limit20,2.8631531113333337,0.01734257567212987,2.860227167,3.7674983333333336,0.156258,2.847459625,2.881772542,349284 +0.136.0,2024-10-09-22,frequency_limit20_index,1.0633833053333335,0.05690465310295847,1.081311333,4.816441333333334,0.2702363333333333,0.9996737080000001,1.109164875,940734 +0.136.0,2024-10-09-22,frequency_no_limit,4.727112305666668,0.061715317730259046,4.698074208,5.593491,0.180478,4.685272042,4.7979906670000005,211551 +0.136.0,2024-10-09-22,frequency_no_limit_index,2.6022785136666666,0.08396557520664231,2.557269,6.371206666666666,0.30891966666666665,2.55041375,2.699152791,384320 +0.136.0,2024-10-09-22,frequency_notrim,2.4027974446666667,0.008777820223043718,2.398003917,3.2956936666666667,0.15378166666666665,2.397460083,2.412928334,416146 +0.136.0,2024-10-09-22,frequency_notrim_index,0.9821368890000001,0.04746079884980989,0.959449583,4.254895333333334,0.2751093333333333,0.950277625,1.036683459,1018330 
+0.136.0,2024-10-09-22,frequency_other_sorted,2.8385583476666665,0.009154976898636062,2.842162375,3.706337,0.15396933333333332,2.828149834,2.845362834,352237 +0.136.0,2024-10-09-22,frequency_other_sorted_index,1.0439775693333333,0.04408596032803923,1.0570245,4.785868333333333,0.278931,0.9948406660000001,1.080067542,957854 +0.136.0,2024-10-09-22,frequency_selregex,0.8031119166666666,0.005922772057214628,0.8064003750000001,0.8800663333333333,0.05293966666666666,0.796274542,0.806660833,1245330 +0.136.0,2024-10-09-22,frequency_selregex_ignorecase,1.0019362636666667,0.003958131683469569,1.000608625,1.0783873333333334,0.05544966666666667,0.998812625,1.006387541,998004 +0.136.0,2024-10-09-22,frequency_sorted,2.670419513666667,0.020978018705839457,2.672767083,3.504291,0.16317866666666667,2.648366458,2.690125,374532 +0.136.0,2024-10-09-22,frequency_sorted_index,2.759115,0.006533081122469273,2.758656042,3.5680853333333338,0.14549066666666666,2.7528235,2.765865458,362450 +0.136.0,2024-10-09-22,geocode_reverse,3.635352513666667,0.02168954742554722,3.638010625,2.964048333333333,10.649058666666667,3.612456416,3.6555905,275103 +0.136.0,2024-10-09-22,geocode_reverse_batchall,3.705906264,0.0007124361086981811,3.706016417,3.307206666666666,10.218860333333334,3.705145167,3.706557208,269833 +0.136.0,2024-10-09-22,geocode_suggest,4.424326458333333,0.01332076052122252,4.41954975,21.383587000000002,2.6517473333333332,4.414052666,4.439376959,226040 +0.136.0,2024-10-09-22,geocode_suggest_batchall,4.436151958,0.011998674399309246,4.440844291,22.09025433333333,1.42405,4.422516208,4.445095375,225428 +0.136.0,2024-10-09-22,index,0.47402747233333337,0.00010761350663523526,0.474064792,0.4296609999999999,0.04324733333333333,0.47390616700000004,0.47411145800000004,2109705 +0.136.0,2024-10-09-22,input,1.6518609443333334,0.005086299425495461,1.6505388330000001,1.5999656666666666,0.05077399999999999,1.6475662500000001,1.65747775,605327 
+0.136.0,2024-10-09-22,join,1.8915908750000001,0.001276399396811533,1.8912137910000002,1.351193,0.5393533333333332,1.8905455,1.893013334,528541 +0.136.0,2024-10-09-22,join_casei,1.8990923469999998,0.005914988734221181,1.899654833,1.35911,0.5389416666666667,1.892916208,1.904706,526593 +0.136.0,2024-10-09-22,joinp,0.791479514,0.001933987035821872,0.790972792,1.4077596666666665,0.08986899999999999,0.789849333,0.7936164170000001,1264223 +0.136.0,2024-10-09-22,joinp_streaming,0.025671264000000003,0.00013254458292589571,0.025618625000000003,0.034173,0.010442333333333333,0.025573125000000002,0.025822042,38461538 +0.136.0,2024-10-09-22,json,18.515705639,0.053356714059158934,18.505266583,16.98018433333333,1.5260143333333331,18.468339917,18.573510417,54007 +0.136.0,2024-10-09-22,jsonl,1.4464047779999998,0.0027990031779148127,1.445855125,7.01248,0.19835666666666665,1.443921375,1.449437834,691563 +0.136.0,2024-10-09-22,jsonl_batchall,0.007816459,0.0001457840333335582,0.007748334000000001,0.004854,0.0023936666666666663,0.007717209,0.007983834,125000000 +0.136.0,2024-10-09-22,jsonl_j1,5.819719833333333,0.011475245824118704,5.819589125,5.685633,0.13232933333333333,5.8083105,5.831259875,171821 +0.136.0,2024-10-09-22,luau_filter,7.428935666666667,0.009451951106225287,7.424287958,6.889022999999999,0.5387573333333333,7.422707417,7.439811625,134608 +0.136.0,2024-10-09-22,luau_filter_colidx,9.097977945,0.0250814889252206,9.091565834,8.539466666666666,0.5573443333333333,9.076724959,9.125643042,109914 +0.136.0,2024-10-09-22,luau_filter_no_globals,4.837267319666666,0.006909354815523144,4.840564667,4.306157333333333,0.5299393333333332,4.829327,4.841910292,206740 +0.136.0,2024-10-09-22,luau_filter_no_globals_colidx,6.395455472666666,0.018600087946171094,6.404931084,5.861872000000001,0.5323926666666666,6.374025667,6.407409667,156372 +0.136.0,2024-10-09-22,luau_filter_no_globals_colidx,6.387959667,0.010597093263644972,6.393553959,5.850938333333333,0.535815,6.37573775,6.394587292,156544 
+0.136.0,2024-10-09-22,luau_filter_no_globals_no_colidx,4.827248333333333,0.010409160348532303,4.8219105,4.298935666666666,0.5270643333333334,4.820590875,4.839243625,207168 +0.136.0,2024-10-09-22,luau_multi,23.364569847333332,0.06760305298498792,23.363684875,22.726609666666665,0.6363066666666666,23.297413625,23.432611042,42799 +0.136.0,2024-10-09-22,luau_multi_colidx,24.95775619433333,0.026643842878957484,24.947410541,24.30202733333333,0.6541143333333334,24.937836792,24.98802125,40067 +0.136.0,2024-10-09-22,luau_multi_no_globals,20.82279369433333,0.2973751150338541,20.659224458,20.181841666666667,0.6393093333333333,20.643109791,21.166046834,48024 +0.136.0,2024-10-09-22,luau_multi_no_globals_colidx,22.24774802766667,0.07362856474543945,22.240344875,21.590948666666666,0.655243,22.178100708,22.3247985,44948 +0.136.0,2024-10-09-22,luau_script,33.837904138999995,0.050418620727490406,33.840241834,33.180931333333334,0.6551846666666666,33.786357333,33.88711325,29553 +0.136.0,2024-10-09-22,luau_script_colidx,35.36256690233333,0.042258863655929034,35.379295666,34.698109666666674,0.6626286666666666,35.314504625,35.393900416,28278 +0.136.0,2024-10-09-22,luau_script_no_globals,31.057191791666668,0.03675492282724094,31.065451625,30.40201233333333,0.6533836666666666,31.01700975,31.089114,32199 +0.136.0,2024-10-09-22,luau_script_no_globals_colidx,32.38958259733334,0.1005595308660732,32.3332995,31.733821666666667,0.6539403333333333,32.329767458,32.505680834,30874 +0.136.0,2024-10-09-22,partition,2.3267367083333332,0.01474690479917634,2.3239864580000003,0.9367673333333334,1.3608143333333331,2.313558542,2.342665125,429738 +0.136.0,2024-10-09-22,pseudo,1.7149044583333335,0.0013415217707730286,1.7146115000000002,1.6574523333333333,0.055862333333333326,1.7137336250000001,1.71636825,583090 +0.136.0,2024-10-09-22,pseudo_formatstr,1.9270131106666668,0.010054385149304729,1.9285091250000002,1.8668756666666664,0.058633333333333336,1.916294541,1.936235666,518941 
+0.136.0,2024-10-09-22,rename,0.9871046526666668,0.0006993774664674101,0.986705166,0.9393923333333333,0.0466,0.9866965830000001,0.9879122090000001,1013171 +0.136.0,2024-10-09-22,replace,2.0904712636666667,0.005182647935350693,2.0914495,2.038176333333333,0.051205333333333325,2.084869208,2.095095083,478469 +0.136.0,2024-10-09-22,reverse,0.9752164030000001,0.003911500340501181,0.974709959,0.8659096666666667,0.10359199999999998,0.9715827920000001,0.979356458,1025641 +0.136.0,2024-10-09-22,reverse_index,6.117939041333334,0.029647164872719674,6.102424291,1.2457633333333333,4.859567999999999,6.099268708,6.152124125,163452 +0.136.0,2024-10-09-22,safenames,0.9873110416666667,0.0017477939056016701,0.9865907500000001,0.9395263333333334,0.04666033333333333,0.9860385,0.989303875,1013171 +0.136.0,2024-10-09-22,sample_10,0.5176511113333334,0.0026381009682744073,0.5161513750000001,0.4723566666666666,0.044233999999999996,0.51610475,0.520697209,1930502 +0.136.0,2024-10-09-22,sample_1000,0.5139957086666668,0.0013784006206823084,0.513462834,0.47076999999999997,0.042213,0.512963292,0.515561,1945525 +0.136.0,2024-10-09-22,sample_100000,0.6506862360000002,0.004323453892143529,0.6495751670000001,0.6026693333333334,0.046593666666666665,0.6470267500000001,0.6554567910000001,1536098 +0.136.0,2024-10-09-22,sample_100000_index,1.1417670003333333,0.002279856408883079,1.143076959,0.18067633333333333,0.9598353333333334,1.139134459,1.143089583,875657 +0.136.0,2024-10-09-22,sample_100000_seeded,0.6475213336666666,0.0007607925677195599,0.6478826670000001,0.5994356666666666,0.046645,0.6466472090000001,0.648034125,1543210 +0.136.0,2024-10-09-22,sample_100000_seeded_faster,0.6370564166666667,0.00255118044164526,0.63679675,0.5895136666666666,0.04610966666666666,0.634645,0.6397275,1569859 +0.136.0,2024-10-09-22,sample_100000_seeded_index,1.1341891393333334,0.008487503092167076,1.132666459,0.178262,0.9546233333333333,1.126566042,1.143334917,881834 
+0.136.0,2024-10-09-22,sample_100000_seeded_index_faster,1.11916925,0.004465985590402288,1.117995834,0.17152766666666666,0.946349,1.115407125,1.124104791,893655 +0.136.0,2024-10-09-22,sample_100000_seeded_index_secure,1.129794125,0.013567921456163637,1.125208125,0.17367066666666664,0.9544003333333334,1.1191135,1.14506075,884956 +0.136.0,2024-10-09-22,sample_100000_seeded_secure,0.6438970140000001,0.005995088218259433,0.64145975,0.596007,0.04657666666666666,0.639504417,0.650726875,1552795 +0.136.0,2024-10-09-22,sample_1000_index,0.03282537466666667,0.00026952162709573834,0.03287625,0.017051,0.014543333333333333,0.032534041,0.033065833,30303030 +0.136.0,2024-10-09-22,sample_10_index,0.019179167,0.00016445632517176142,0.019233167000000002,0.015091666666666665,0.003276,0.0189945,0.019309834,52631579 +0.136.0,2024-10-09-22,sample_25pct_index,2.784334319333334,0.010606911422785317,2.781489958,0.41741633333333333,2.3654466666666667,2.775439583,2.796073417,359195 +0.136.0,2024-10-09-22,sample_25pct_seeded_index,2.78099625,0.00379345691057489,2.78093475,0.41989866666666664,2.3597406666666667,2.777233917,2.784820083,359583 +0.136.0,2024-10-09-22,schema,7.8167974026666664,0.040637817784430603,7.831071542,13.272739999999999,0.393381,7.770948333,7.848372333,127926 +0.136.0,2024-10-09-22,schema_index,0.11072668033333334,0.000484563093606325,0.11074679100000001,1.0238993333333333,0.062112666666666656,0.11023237500000001,0.111200875,9009009 +0.136.0,2024-10-09-22,search,0.5870271663333334,0.0010020523424938018,0.5864650410000001,0.5419103333333334,0.043955333333333325,0.586432375,0.588184083,1703578 +0.136.0,2024-10-09-22,search_file,1.0692094583333331,0.0021090949990984855,1.06957,1.021632,0.046468333333333334,1.066943333,1.071115042,935454 +0.136.0,2024-10-09-22,search_file_case_sensitive,0.9843420133333334,0.0007680726995515422,0.984066541,0.9367206666666666,0.046418999999999995,0.983749666,0.9852098330000001,1016260 
+0.136.0,2024-10-09-22,search_file_case_sensitive_unicode,0.9835609446666668,0.0015630574535778763,0.9834135,0.9346326666666668,0.047799,0.982076834,0.9851925,1016260 +0.136.0,2024-10-09-22,search_file_flag,1.2531727916666668,0.0008596545780139056,1.253225583,1.203385,0.04862666666666667,1.2522879580000001,1.254004834,798085 +0.136.0,2024-10-09-22,search_file_flag_matchonly,0.8408497920000001,0.0026936119251155933,0.840080292,0.7939083333333333,0.04576266666666667,0.838624667,0.843844417,1189061 +0.136.0,2024-10-09-22,search_file_literal,0.9418367220000001,0.004136733581775356,0.9425460000000001,0.8937806666666667,0.046918999999999995,0.9373912080000001,0.9455729580000001,1061571 +0.136.0,2024-10-09-22,search_file_unicode,1.0884298053333334,0.0018019277360431514,1.0882925,1.0402263333333333,0.047083999999999994,1.086700458,1.090296458,919118 +0.136.0,2024-10-09-22,search_unicode,0.5862996943333334,0.0010450075642292427,0.585999,0.5415249999999999,0.04366233333333333,0.585438,0.587462083,1706485 +0.136.0,2024-10-09-22,searchset,1.2479794026666666,0.0033260528682583397,1.24748775,1.5688736666666667,0.10403333333333332,1.244926542,1.251523916,801282 +0.136.0,2024-10-09-22,searchset_ignorecase,1.4406021803333333,0.001061116164262995,1.4411695,1.7601093333333333,0.10427633333333335,1.439378,1.441259041,693963 +0.136.0,2024-10-09-22,searchset_unicode,1.2449286526666665,0.0048582164261754095,1.243868708,1.5592396666666666,0.10194599999999998,1.240687917,1.250229333,803213 +0.136.0,2024-10-09-22,select,0.4911789306666667,0.000493972853482189,0.491139166,0.4470453333333333,0.04304333333333333,0.490706042,0.491691584,2036660 +0.136.0,2024-10-09-22,select_regex,0.526533139,0.0018390845593602076,0.526113458,0.4818723333333333,0.04352033333333333,0.524940167,0.5285457920000001,1897533 +0.136.0,2024-10-09-22,slice_last_1k,0.008980902666666667,0.00016669415587336408,0.009022959,0.006043666666666666,0.0023713333333333334,0.008797208,0.009122541000000001,111111111 
+0.136.0,2024-10-09-22,slice_last_1k_index,0.008859264,0.00013893256714320095,0.008839125,0.005673333333333333,0.0024966666666666666,0.0087315,0.009007167,111111111 +0.136.0,2024-10-09-22,slice_last_1k_json,0.011867916333333334,0.00021738081900741208,0.01192925,0.008818666666666667,0.002439,0.011626458000000001,0.012048041,83333333 +0.136.0,2024-10-09-22,slice_last_1k_json_index,0.012107569,0.000030551562365941323,0.012092708,0.008600666666666666,0.0027273333333333333,0.012087291,0.012142708,83333333 +0.136.0,2024-10-09-22,slice_one_middle,0.2571123886666667,0.0011219957566971101,0.256721875,0.23244233333333333,0.023569666666666666,0.256237833,0.258377458,3891051 +0.136.0,2024-10-09-22,slice_one_middle_index,0.007865847,0.00005689357381110821,0.007883708,0.004748666666666666,0.002436,0.007802166,0.007911667,125000000 +0.136.0,2024-10-09-22,snappy_compress,0.17543269433333333,0.006437975025409485,0.178878375,1.162416,0.10746833333333333,0.16800520800000002,0.1794145,5714286 +0.136.0,2024-10-09-22,snappy_decompress,0.4950796386666667,0.0023225127598840655,0.49471354100000003,0.47328966666666666,0.020706000000000002,0.492961917,0.497563458,2020202 +0.136.0,2024-10-09-22,snappy_validate,0.4778566806666667,0.0004231111302557911,0.47797025000000004,0.45737266666666665,0.019444,0.47738837500000003,0.478211417,2092050 +0.136.0,2024-10-09-22,sort,1.2448715416666667,0.004062296792098928,1.243819625,2.116125,0.12835066666666664,1.241438667,1.249356333,803213 +0.136.0,2024-10-09-22,sort_random_seeded,1.1755278606666666,0.01747898854165516,1.172330541,1.0638263333333333,0.106891,1.15986825,1.194384791,850340 +0.136.0,2024-10-09-22,sort_random_seeded_faster,1.1654890973333334,0.01166500440181754,1.1681225,1.0480146666666668,0.113234,1.1527325,1.175612292,858369 +0.136.0,2024-10-09-22,sort_random_seeded_secure,1.1753826666666667,0.010507493503398389,1.169346375,1.0649463333333333,0.10569433333333333,1.169286,1.187515625,851064 
+0.136.0,2024-10-09-22,sortcheck_sorted,0.4984155003333333,0.0002929748303563367,0.49856108400000004,0.4543566666666667,0.04294599999999999,0.49807825000000006,0.498607167,2008032 +0.136.0,2024-10-09-22,sortcheck_unsorted,0.007578639000000001,0.00018884501208133643,0.007486417,0.004672,0.0023496666666666666,0.007453625,0.007795875000000001,125000000 +0.136.0,2024-10-09-22,sortcheck_unsorted_all,0.5326300416666666,0.002544201329297766,0.532960416,0.48673799999999995,0.044769,0.529936792,0.5349929170000001,1876173 +0.136.0,2024-10-09-22,split,0.9717815833333333,0.04966093558043338,0.9652262500000001,0.816767,0.10735266666666665,0.9257238750000001,1.024394625,1028807 +0.136.0,2024-10-09-22,split_chunks,1.0157952636666667,0.02866098694382442,1.024791417,1.2385943333333334,0.156238,0.983715416,1.038878958,984252 +0.136.0,2024-10-09-22,split_chunks_index,0.237130722,0.11520503998050603,0.203406208,1.0650579999999998,0.205462,0.142551541,0.365434417,4219409 +0.136.0,2024-10-09-22,split_chunks_index_j1,0.009002514000000001,0.00007689415139657865,0.009030916,0.005336666666666667,0.004367666666666666,0.008915459,0.009061167,111111111 +0.136.0,2024-10-09-22,split_index,0.20669505566666668,0.05728089425847374,0.23947558300000002,1.0443603333333333,0.19973866666666665,0.14055362500000002,0.240055959,4830918 +0.136.0,2024-10-09-22,split_index_j1,1.0268533613333333,0.03843516155693625,1.010459542,0.8795676666666666,0.11235266666666666,0.9993334170000001,1.070767125,973710 +0.136.0,2024-10-09-22,split_kbsize,2.036766389,0.04346158710737812,2.018941292,1.8980946666666665,0.11043033333333334,2.00505125,2.086306625,490918 +0.136.0,2024-10-09-22,sqlp,1.0465692636666668,0.013914078754769328,1.045776375,2.0121113333333334,0.27294066666666666,1.033068583,1.060862833,955110 +0.136.0,2024-10-09-22,sqlp_aggregations,0.9915284443333334,0.011169118131619549,0.9854105420000001,1.3752786666666665,0.05673066666666667,0.9847549160000001,1.004419875,1008065 
+0.136.0,2024-10-09-22,sqlp_aggregations_expensive,0.8579473750000001,0.01071784359307729,0.854041916,1.4292443333333333,0.096925,0.849729917,0.8700702920000001,1165501 +0.136.0,2024-10-09-22,sqlp_aggregations_expensive_streaming,0.8589640693333335,0.007709582229175652,0.856374708,1.4293953333333331,0.09164933333333332,0.8528825000000001,0.867635,1164144 +0.136.0,2024-10-09-22,sqlp_aggregations_expensive_vs_duckdb,0.22032431933333338,0.004612126504017463,0.21811233300000002,0.8804273333333333,0.12304033333333332,0.21723483300000002,0.22562579200000002,4545455 +0.136.0,2024-10-09-22,sqlp_aggregations_vs_duckdb,0.19413722266666666,0.004535591784381205,0.195525792,0.6153073333333333,0.10211199999999998,0.18906966700000002,0.19781620900000002,5154639 +0.136.0,2024-10-09-22,sqlp_format_arrow,1.7363877083333332,0.0031499168238577568,1.735523583,2.212789666666666,0.7271723333333333,1.7337600420000001,1.7398795,576037 +0.136.0,2024-10-09-22,sqlp_format_avro,1.0304819856666667,0.005013535079717769,1.029383666,1.9862596666666665,0.3737723333333333,1.026108666,1.035953625,970874 +0.136.0,2024-10-09-22,sqlp_format_json,1.0228909720000001,0.0014389669238213615,1.023682666,1.964939,0.2585943333333333,1.02123,1.02376025,977517 +0.136.0,2024-10-09-22,sqlp_format_jsonl,1.023724417,0.001900603307837816,1.022800209,1.9640849999999999,0.25551566666666664,1.022462667,1.025910375,976562 +0.136.0,2024-10-09-22,sqlp_format_parquet,1.025136014,0.0019723300715826465,1.024200291,1.9758483333333334,0.270604,1.023805709,1.027402042,975610 +0.136.0,2024-10-09-22,sqlp_format_parquet_statistics,1.025225514,0.0020774838027392698,1.025213792,1.9743553333333332,0.2804173333333333,1.023153916,1.027308834,975610 +0.136.0,2024-10-09-22,sqlp_lowmemory,1.0151198193333333,0.001987065600690942,1.015369,1.963368,0.2535543333333333,1.013019916,1.016970542,985222 
+0.136.0,2024-10-09-22,sqlp_nooptimizations,1.0478791386666666,0.0008166565087663174,1.047478,2.0146046666666666,0.18714033333333335,1.047340625,1.048818791,954198 +0.136.0,2024-10-09-22,sqlp_tryparsedates,11.918393847333334,0.013805430703016874,11.914084625000001,12.856793333333334,0.2654393333333333,11.907257,11.933839917,83907 +0.136.0,2024-10-09-22,sqlp_tryparsedates_inferlen,1.9179708060000003,0.006179784729398966,1.9200773340000001,2.8641963333333336,0.25034033333333333,1.9110131670000001,1.922821917,521376 +0.136.0,2024-10-09-22,stats,1.909606736,0.0049196025849774,1.910509792,2.219391333333333,0.09494199999999998,1.904298166,1.91401225,523560 +0.136.0,2024-10-09-22,stats_create_cache,1.9097833056666669,0.004521451395342963,1.9118497090000002,2.2203649999999997,0.09286466666666666,1.904597875,1.912902333,523560 +0.136.0,2024-10-09-22,stats_everything,3.3554944723333335,0.04029503222196921,3.343843042,10.192412666666668,0.3932616666666666,3.322309,3.400331375,298063 +0.136.0,2024-10-09-22,stats_everything_create_cache,3.3900974583333334,0.018655247417147178,3.382474708,10.237715333333332,0.392744,3.376460667,3.411357,294985 +0.136.0,2024-10-09-22,stats_everything_index,1.3312123473333333,0.009204037936792824,1.32928925,10.825936333333331,0.5668519999999999,1.323121792,1.341226,751315 +0.136.0,2024-10-09-22,stats_everything_index_j1,6.614990667000001,0.15419707857364842,6.636688584,6.550748,0.21150966666666668,6.451093875,6.757189542,151172 +0.136.0,2024-10-09-22,stats_everything_index_j1_with_cache,0.007933236333333335,0.00015488744395958354,0.007865750000000001,0.004810333333333333,0.0024836666666666666,0.007823542000000001,0.008110417,125000000 +0.136.0,2024-10-09-22,stats_everything_index_with_cache,0.008150389000000001,0.00004450365283884046,0.008145625,0.004868666666666666,0.002627666666666667,0.008108459,0.008197083,125000000 
+0.136.0,2024-10-09-22,stats_everything_infer_dates,7.172214625333333,0.001711103903247197,7.172136042,14.419449,0.4363456666666667,7.170544167,7.173963667,139431 +0.136.0,2024-10-09-22,stats_everything_infer_dates_index,1.892725361,0.026426721653890588,1.9041573330000001,17.084736,0.5867853333333334,1.8625072500000002,1.9115115,528262 +0.136.0,2024-10-09-22,stats_everything_infer_dates_index_with_cache,0.008059833333333334,0.00016426584580287267,0.007988208,0.004905333333333334,0.0025053333333333334,0.007943542000000001,0.00824775,125000000 +0.136.0,2024-10-09-22,stats_everything_j1,6.374932889,0.03861090214131202,6.378022792,6.5876529999999995,0.26058733333333334,6.334869875,6.411906,156863 +0.136.0,2024-10-09-22,stats_everything_sorted,3.3091379170000006,0.01562012204732685,3.309453625,9.845339666666666,0.40390433333333325,3.2933623340000002,3.324597792,302206 +0.136.0,2024-10-09-22,stats_everything_sorted_index,3.3277645556666666,0.006919087020449097,3.326532125,9.828958333333333,0.4096936666666666,3.3215445,3.335217042,300481 +0.136.0,2024-10-09-22,stats_index,0.2264774586666667,0.004532509862893005,0.224717459,2.293756,0.06589166666666667,0.22308891700000003,0.23162600000000003,4424779 +0.136.0,2024-10-09-22,stats_index_j1,1.8859975693333333,0.0014554704378813388,1.8864251250000001,1.8443606666666668,0.04049133333333333,1.884376208,1.887191375,530223 +0.136.0,2024-10-09-22,stats_index_j1_with_cache,0.007778472333333334,0.00011276332092632511,0.007799292,0.004687666666666666,0.002419333333333333,0.00765675,0.007879375000000001,125000000 +0.136.0,2024-10-09-22,stats_index_with_cache,0.007986986,0.000047691702244729985,0.007972625,0.004798,0.0025496666666666667,0.007948125,0.008040208,125000000 +0.136.0,2024-10-09-22,table,6.831468819666667,0.023255516463334465,6.827525417,5.733177333333333,1.0903183333333333,6.810437125,6.856443917,146391 
+0.136.0,2024-10-09-22,to_datapackage,0.695291889,0.0036729801911510383,0.6936888750000001,2.0978336666666664,0.30246833333333334,0.6926928750000001,0.6994939170000001,1438849 +0.136.0,2024-10-09-22,to_sqlite,0.6869207916666668,0.006021595270961332,0.688884916,2.080145,0.29434066666666664,0.6801623750000001,0.691715084,1455604 +0.136.0,2024-10-09-22,to_xlsx,30.79641108333333,0.03361517688094782,30.809203667,29.30711633333333,3.063931,30.758277708,30.821751875,32472 +0.136.0,2024-10-09-22,tojsonl,5.255716042,0.02947630512929443,5.257766042,10.161964666666664,0.4150133333333333,5.22526825,5.284113834,190259 +0.136.0,2024-10-09-22,tojsonl_batchall,5.2791820136666665,0.018680752373076034,5.275491625,10.091462666666665,0.37808400000000003,5.262621875,5.299432541,189430 +0.136.0,2024-10-09-22,tojsonl_index,2.8958170693333334,0.07985544815714167,2.86844975,11.432261333333335,0.4399403333333334,2.8332435,2.985757958,345304 +0.136.0,2024-10-09-22,tojsonl_index_j1,8.995612166666666,0.019833368532575935,9.003292875,8.607582666666666,0.3818063333333333,8.973087125,9.0104565,111161 +0.136.0,2024-10-09-22,tojsonl_j1,9.050394528,0.04188608359892298,9.033382667,9.022415666666666,0.430021,9.019690875,9.098110042,110497 +0.136.0,2024-10-09-22,tojsonl_trim,5.348996528000001,0.006708534294621601,5.348959584,10.950814333333334,0.3859336666666667,5.342306542,5.355723458,186951 +0.136.0,2024-10-09-22,tojsonl_trim_j1,9.571418291666667,0.01507952192561961,9.56800225,9.526006333333333,0.4491516666666666,9.558339833,9.587912792000001,104482 +0.136.0,2024-10-09-22,transpose,2.6229285693333333,0.025532015383013567,2.611978792,2.4179046666666664,0.19915433333333332,2.60469775,2.652109166,381243 +0.136.0,2024-10-09-22,transpose_multipass,21.422549930333332,0.02144520581620947,21.414166625,19.688017333333335,1.7326546666666667,21.406562708,21.446920458,46679 
+0.136.0,2024-10-09-22,validate,1.3454596946666666,0.011306645865111651,1.344570917,6.256567999999999,0.15226666666666666,1.334623667,1.3571845,743494 +0.136.0,2024-10-09-22,validate_batchall,1.2823367776666668,0.0075023406401299505,1.283977083,5.824079,0.11468199999999999,1.2741500000000001,1.28888325,780031 +0.136.0,2024-10-09-22,validate_batchall_index,1.2857204443333334,0.004130539026058984,1.286295208,5.745959666666667,0.12479633333333333,1.2813326250000001,1.2895335,777605 +0.136.0,2024-10-09-22,validate_dynenum,1.3203521386666666,0.004890651370770679,1.318326208,6.183933333333333,0.14744133333333334,1.3168,1.325930208,757576 +0.136.0,2024-10-09-22,validate_dynenum_batchall,1.272792861,0.0026021360365074747,1.273357416,5.749370666666667,0.11033633333333333,1.269954792,1.275066375,785546 +0.136.0,2024-10-09-22,validate_dynenum_batchall_index,1.271873194,0.006069566186101182,1.273125708,5.687784333333333,0.12177833333333332,1.2652750830000001,1.2772187910000001,786164 +0.136.0,2024-10-09-22,validate_dynenum_index,1.26976775,0.0012953332136516952,1.26936025,5.678887333333333,0.10425066666666666,1.268725167,1.2712178330000001,787402 +0.136.0,2024-10-09-22,validate_dynenum_no_schema,1.3317769723333333,0.00857000757395079,1.335231583,6.148984666666666,0.14937566666666666,1.322018834,1.3380805,750751 +0.136.0,2024-10-09-22,validate_dynenum_no_schema_index,1.267980167,0.004467809592701886,1.269947584,5.694462333333333,0.102148,1.262866292,1.271126625,788644 +0.136.0,2024-10-09-22,validate_dynenum_valid_output,2.2017550413333336,0.0028643772618533727,2.202526583,7.002077333333333,0.19268933333333335,2.198583916,2.204154625,454133 +0.136.0,2024-10-09-22,validate_dynenum_valid_output_index,2.1348443749999997,0.0037203540786369966,2.136084209,6.538800666666667,0.1434173333333333,2.130662416,2.1377865,468384 +0.136.0,2024-10-09-22,validate_index,1.2818570833333334,0.0032141490070814053,1.2826240420000001,5.739469666666666,0.10224566666666667,1.278328833,1.284618375,780031 
+0.136.0,2024-10-09-22,validate_no_schema,0.4880847496666667,0.0036591216167233724,0.489340916,0.4425666666666666,0.04411433333333333,0.48396300000000003,0.490950333,2049180 +0.136.0,2024-10-09-22,validate_no_schema_index,0.49482838866666673,0.0009017353931089343,0.49446141600000004,0.45398833333333327,0.039464,0.49416800000000005,0.49585575000000004,2020202 +0.136.0,2024-10-09-22,validate_valid_output,2.2125143473333337,0.004183940417602296,2.21488575,7.101555,0.1944523333333333,2.207683417,2.214973875,451875 +0.136.0,2024-10-09-22,validate_valid_output_index,2.143425805666667,0.008207043638664849,2.138873375,6.604312666666666,0.13987033333333332,2.138503959,2.152900083,466636 diff --git a/scripts/results/latest_run_info.tsv b/scripts/results/latest_run_info.tsv index 8e37c2cac..311f99e46 100644 --- a/scripts/results/latest_run_info.tsv +++ b/scripts/results/latest_run_info.tsv @@ -1,2 +1,2 @@ version tstamp logtime bm_version platform cores mem binary kind argument total_count wo_index_count with_index_count warmup_runs benchmark_runs elapsed_secs total_mean qsv_env version_info -0.135.0 2024-09-24-16 2024-09-24-16-09-43 5.0.0 aarch64-apple-darwin 12 34359738368 qsv prebuilt 233 179 54 2 3 4041 800.027 No qsv-relevant environment variables set. qsv 0.135.0-mimalloc-apply;fetch;foreach;geocode;Luau 0.640;to;polars-0.43.1-py-1.8.1;self_update-12-12;25.60 GiB-0 B-18.97 GiB-32.00 GiB (aarch64-apple-darwin compiled with Rust 1.81) prebuilt +0.136.0 2024-10-09-22 2024-10-09-22-56-27 5.1.0 aarch64-apple-darwin 12 34359738368 qsv prebuilt 234 180 54 2 3 3990 788.815 No qsv-relevant environment variables set. 
qsv 0.136.0-mimalloc-apply;fetch;foreach;geocode;Luau 0.640;to;polars-0.43.1-ee9bafb;self_update-12-12;25.60 GiB-0 B-19.70 GiB-32.00 GiB (aarch64-apple-darwin compiled with Rust 1.81) prebuilt diff --git a/scripts/results/run_info_history.tsv b/scripts/results/run_info_history.tsv index 150f4ada9..618fc861a 100644 --- a/scripts/results/run_info_history.tsv +++ b/scripts/results/run_info_history.tsv @@ -1,4 +1,5 @@ version tstamp logtime bm_version platform cores mem binary kind argument total_count wo_index_count with_index_count warmup_runs benchmark_runs elapsed_secs total_mean qsv_env version_info +0.136.0 2024-10-09-22 2024-10-09-22-56-27 5.1.0 aarch64-apple-darwin 12 34359738368 qsv prebuilt 234 180 54 2 3 3990 788.815 No qsv-relevant environment variables set. qsv 0.136.0-mimalloc-apply;fetch;foreach;geocode;Luau 0.640;to;polars-0.43.1-ee9bafb;self_update-12-12;25.60 GiB-0 B-19.70 GiB-32.00 GiB (aarch64-apple-darwin compiled with Rust 1.81) prebuilt 0.135.0 2024-09-27-15 2024-09-27-15-52-37 5.1.0 aarch64-apple-darwin 12 34359738368 qsv prebuilt apply_op_sentiment 1 1 0 2 3 18 3.595 No qsv-relevant environment variables set. qsv 0.135.0-mimalloc-apply;fetch;foreach;geocode;Luau 0.640;to;polars-0.43.1-py-1.8.1;self_update-12-12;25.60 GiB-0 B-18.13 GiB-32.00 GiB (aarch64-apple-darwin compiled with Rust 1.81) prebuilt 0.135.0 2024-09-24-16 2024-09-24-16-09-43 5.0.0 aarch64-apple-darwin 12 34359738368 qsv prebuilt 233 179 54 2 3 4041 800.027 No qsv-relevant environment variables set. qsv 0.135.0-mimalloc-apply;fetch;foreach;geocode;Luau 0.640;to;polars-0.43.1-py-1.8.1;self_update-12-12;25.60 GiB-0 B-18.97 GiB-32.00 GiB (aarch64-apple-darwin compiled with Rust 1.81) prebuilt 0.134.0 2024-09-11-04 2024-09-11-04-55-36 4.12.0 aarch64-apple-darwin 12 34359738368 qsv prebuilt 225 175 50 2 3 7272 824.618 No qsv-relevant environment variables set. 
qsv 0.134.0-mimalloc-apply;fetch;foreach;geocode;Luau 0.640;to;polars-0.42.0-fe04390;self_update-12-12;25.60 GiB-931.62 MiB-18.35 GiB-32.00 GiB (aarch64-apple-darwin compiled with Rust 1.81) prebuilt From 96b9ac9d4c923420bb0b90e8c23fb35544026d00 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Thu, 10 Oct 2024 06:12:54 -0400 Subject: [PATCH 003/119] `deps`: remove old csvs_convert patch entry; update lock file --- Cargo.lock | 36 ++++++++++++++++++------------------ Cargo.toml | 3 --- 2 files changed, 18 insertions(+), 21 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 54c4cc450..d4c7be277 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -556,9 +556,9 @@ dependencies = [ [[package]] name = "ashpd" -version = "0.9.1" +version = "0.9.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bfe7e0dd0ac5a401dc116ed9f9119cf9decc625600474cb41f0fc0a0050abc9a" +checksum = "4d43c03d9e36dd40cab48435be0b09646da362c278223ca535493877b2c1dee9" dependencies = [ "async-fs", "async-net", @@ -3481,9 +3481,9 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.70" +version = "0.3.71" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1868808506b929d7b0cfa8f75951347aa71bb21144b7791bae35d9bccfcfe37a" +checksum = "0cb94a0ffd3f3ee755c20f7d8752f45cac88605a4dcf808abcff72873296ec7b" dependencies = [ "wasm-bindgen", ] @@ -7575,9 +7575,9 @@ checksum = "b8dad83b4f25e74f184f64c43b150b91efe7647395b42289f38e50566d82855b" [[package]] name = "wasm-bindgen" -version = "0.2.93" +version = "0.2.94" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a82edfc16a6c469f5f44dc7b571814045d60404b55a0ee849f9bcfa2e63dd9b5" +checksum = "ef073ced962d62984fb38a36e5fdc1a2b23c9e0e1fa0689bb97afa4202ef6887" dependencies = [ "cfg-if", "once_cell", @@ -7586,9 +7586,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-backend" -version = "0.2.93" +version = "0.2.94" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "9de396da306523044d3302746f1208fa71d7532227f15e347e2d93e4145dd77b" +checksum = "c4bfab14ef75323f4eb75fa52ee0a3fb59611977fd3240da19b2cf36ff85030e" dependencies = [ "bumpalo", "log", @@ -7601,9 +7601,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-futures" -version = "0.4.43" +version = "0.4.44" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61e9300f63a621e96ed275155c108eb6f843b6a26d053f122ab69724559dc8ed" +checksum = "65471f79c1022ffa5291d33520cbbb53b7687b01c2f8e83b57d102eed7ed479d" dependencies = [ "cfg-if", "js-sys", @@ -7613,9 +7613,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.93" +version = "0.2.94" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "585c4c91a46b072c92e908d99cb1dcdf95c5218eeb6f3bf1efa991ee7a68cccf" +checksum = "a7bec9830f60924d9ceb3ef99d55c155be8afa76954edffbb5936ff4509474e7" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -7623,9 +7623,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.93" +version = "0.2.94" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "afc340c74d9005395cf9dd098506f7f44e38f2b4a21c6aaacf9a105ea5e1e836" +checksum = "4c74f6e152a76a2ad448e223b0fc0b6b5747649c3d769cc6bf45737bf97d0ed6" dependencies = [ "proc-macro2", "quote", @@ -7636,9 +7636,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.93" +version = "0.2.94" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c62a0a307cb4a311d3a07867860911ca130c3494e8c2719593806c08bc5d0484" +checksum = "a42f6c679374623f295a8623adfe63d9284091245c3504bde47c17a3ce2777d9" [[package]] name = "wasm-streams" @@ -7740,9 +7740,9 @@ dependencies = [ [[package]] name = "web-sys" -version = "0.3.70" +version = "0.3.71" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"26fdeaafd9bd129f65e7c031593c24d62186301e0c72c8978fa1678be7d532c0" +checksum = "44188d185b5bdcae1052d08bcbcf9091a5524038d4572cc4f4f2bb9d5554ddd9" dependencies = [ "js-sys", "wasm-bindgen", diff --git a/Cargo.toml b/Cargo.toml index c2012627e..e5a3f4cbe 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -280,9 +280,6 @@ csv-index = { git = "https://github.com/jqnatividad/rust-csv", branch = "qsv-opt # use our csvlens fork with latest dependencies, including arrow 53 upstream, with unreleased lexical-core fix csvlens = { git = "https://github.com/jqnatividad/csvlens", branch = "dependency-upgrades-lexical-core_fix" } -# upstream csvs_convert has some old dependencies -# csvs_convert = { git = "https://github.com/jqnatividad/csvs_convert", branch = "bump-more-dependencies" } - # needed as dynfmt doesn't work in release mode without this # see https://github.com/jan-auer/dynfmt/pull/9 dynfmt = { git = "https://github.com/jqnatividad/dynfmt", branch = "2021-clippy_ptr_as_ptr-bumpdeps" } From d56ee9949d7a0de99f0b0c1e9df4190f0b864c8c Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Thu, 10 Oct 2024 07:04:16 -0400 Subject: [PATCH 004/119] `validate`: typo --- src/cmd/validate.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cmd/validate.rs b/src/cmd/validate.rs index 8888559ab..e87f4911b 100644 --- a/src/cmd/validate.rs +++ b/src/cmd/validate.rs @@ -24,7 +24,7 @@ qsv supports a custom format - `currency`. This format will only accept a valid 1. ISO Currency Symbol (optional): This is the ISO 4217 three-character code or currency symbol (e.g. USD, EUR, JPY, $, €, ¥, etc.) - 2. Amount: This is the numerical value of the currency.More than 2 decimal places are allowed. + 2. Amount: This is the numerical value of the currency. More than 2 decimal places are allowed. 3. 
Formats: Valid currency formats include: Standard: $1,000.00 or USD1000.00 Negative amounts: ($100.00) or -$100.00 From 7796b8bc5210d7d858a1a575f51dfd279818d1a8 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 10 Oct 2024 19:58:42 +0000 Subject: [PATCH 005/119] build(deps): bump calamine from 0.26.0 to 0.26.1 Bumps [calamine](https://github.com/tafia/calamine) from 0.26.0 to 0.26.1. - [Changelog](https://github.com/tafia/calamine/blob/master/Changelog.md) - [Commits](https://github.com/tafia/calamine/compare/v0.26.0...v0.26.1) --- updated-dependencies: - dependency-name: calamine dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- Cargo.lock | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d4c7be277..f4dd659d5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1137,9 +1137,9 @@ checksum = "ade8366b8bd5ba243f0a58f036cc0ca8a2f069cff1a2351ef1cac6b083e16fc0" [[package]] name = "calamine" -version = "0.26.0" +version = "0.26.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b262d93265ddca1ab8dcc57fadd5fe365ed52e717ac7a24a9eabf990cf9566a" +checksum = "138646b9af2c5d7f1804ea4bf93afc597737d2bd4f7341d67c48b03316976eb1" dependencies = [ "byteorder", "chrono", From cc1e603a9e041e1200facf573b6f7e3a119e9f42 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Fri, 11 Oct 2024 05:15:00 -0400 Subject: [PATCH 006/119] refactor `optimal_batch_size` to require indexed CSV files --- src/util.rs | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/src/util.rs b/src/util.rs index fc6b4fbe2..b12b68a51 100644 --- a/src/util.rs +++ b/src/util.rs @@ -2210,6 +2210,7 @@ pub fn csv_to_jsonl( } /// get the optimal batch size +/// if CSV is not indexed and ROW_COUNT is not set, return DEFAULT_BATCH_SIZE /// if 
batch_size is 0, return the number of rows in the CSV, effectively disabling batching /// if batch_size is 1, force batch_size to be set to "optimal_size", even though /// its not recommended (number of rows is too small for parallel processing) @@ -2217,23 +2218,21 @@ pub fn csv_to_jsonl( /// failing everything above, return the requested batch_size #[inline] pub fn optimal_batch_size(rconfig: &Config, batch_size: usize, num_jobs: usize) -> usize { - if batch_size < DEFAULT_BATCH_SIZE { - return DEFAULT_BATCH_SIZE; - } - // if ROW_COUNT is not known, even if the input is not indexed, we still determine - // optimal batch size if polars is enabled, as its fast even without an index. - // Otherwise, we return the default batch size, as the perf hit of counting rows is too high - // without an index with polars disabled - #[cfg(not(feature = "polars"))] - if ROW_COUNT.get().is_none() { + if batch_size > 1 && batch_size < DEFAULT_BATCH_SIZE { return DEFAULT_BATCH_SIZE; } - let num_rows = if let Ok(rows) = count_rows(rconfig) { - rows as usize - } else { - return DEFAULT_BATCH_SIZE; + let num_rows = match ROW_COUNT.get() { + Some(count) => count.unwrap() as usize, + None => { + if let Ok(Some(idx)) = rconfig.indexed() { + idx.count() as usize + } else { + return DEFAULT_BATCH_SIZE; + } + }, }; + if batch_size == 0 { // disable batching, handle all rows in one batch num_rows From 7c57c27a34efd9862a61c0669fe56d310ff4c698 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Fri, 11 Oct 2024 05:36:10 -0400 Subject: [PATCH 007/119] `deps`: update lock file --- Cargo.lock | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f4dd659d5..8b2a2a96b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -612,9 +612,9 @@ dependencies = [ [[package]] name = "async-compression" -version = "0.4.13" +version = "0.4.14" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "7e614738943d3f68c628ae3dbce7c3daffb196665f82f8c8ea6b65de73c79429" +checksum = "998282f8f49ccd6116b0ed8a4de0fbd3151697920e7c7533416d6e25e76434a7" dependencies = [ "brotli 7.0.0", "flate2", @@ -1168,9 +1168,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.1.28" +version = "1.1.29" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e80e3b6a3ab07840e1cae9b0666a63970dc28e8ed5ffbcdacbfc760c281bfc1" +checksum = "58e804ac3194a48bb129643eb1d62fcc20d18c6b8c181704489353d13120bcd1" dependencies = [ "jobserver", "libc", @@ -3481,9 +3481,9 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.71" +version = "0.3.72" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0cb94a0ffd3f3ee755c20f7d8752f45cac88605a4dcf808abcff72873296ec7b" +checksum = "6a88f1bda2bd75b0452a14784937d796722fdebfe50df998aeb3f0b7603019a9" dependencies = [ "wasm-bindgen", ] @@ -7575,9 +7575,9 @@ checksum = "b8dad83b4f25e74f184f64c43b150b91efe7647395b42289f38e50566d82855b" [[package]] name = "wasm-bindgen" -version = "0.2.94" +version = "0.2.95" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ef073ced962d62984fb38a36e5fdc1a2b23c9e0e1fa0689bb97afa4202ef6887" +checksum = "128d1e363af62632b8eb57219c8fd7877144af57558fb2ef0368d0087bddeb2e" dependencies = [ "cfg-if", "once_cell", @@ -7586,9 +7586,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-backend" -version = "0.2.94" +version = "0.2.95" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c4bfab14ef75323f4eb75fa52ee0a3fb59611977fd3240da19b2cf36ff85030e" +checksum = "cb6dd4d3ca0ddffd1dd1c9c04f94b868c37ff5fac97c30b97cff2d74fce3a358" dependencies = [ "bumpalo", "log", @@ -7601,9 +7601,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-futures" -version = "0.4.44" +version = "0.4.45" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"65471f79c1022ffa5291d33520cbbb53b7687b01c2f8e83b57d102eed7ed479d" +checksum = "cc7ec4f8827a71586374db3e87abdb5a2bb3a15afed140221307c3ec06b1f63b" dependencies = [ "cfg-if", "js-sys", @@ -7613,9 +7613,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.94" +version = "0.2.95" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a7bec9830f60924d9ceb3ef99d55c155be8afa76954edffbb5936ff4509474e7" +checksum = "e79384be7f8f5a9dd5d7167216f022090cf1f9ec128e6e6a482a2cb5c5422c56" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -7623,9 +7623,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.94" +version = "0.2.95" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c74f6e152a76a2ad448e223b0fc0b6b5747649c3d769cc6bf45737bf97d0ed6" +checksum = "26c6ab57572f7a24a4985830b120de1594465e5d500f24afe89e16b4e833ef68" dependencies = [ "proc-macro2", "quote", @@ -7636,9 +7636,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.94" +version = "0.2.95" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a42f6c679374623f295a8623adfe63d9284091245c3504bde47c17a3ce2777d9" +checksum = "65fc09f10666a9f147042251e0dda9c18f166ff7de300607007e96bdebc1068d" [[package]] name = "wasm-streams" @@ -7740,9 +7740,9 @@ dependencies = [ [[package]] name = "web-sys" -version = "0.3.71" +version = "0.3.72" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "44188d185b5bdcae1052d08bcbcf9091a5524038d4572cc4f4f2bb9d5554ddd9" +checksum = "f6488b90108c040df0fe62fa815cbdee25124641df01814dd7282749234c6112" dependencies = [ "js-sys", "wasm-bindgen", From 816056227275f57bdd6665874a72fea6265da557 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Fri, 11 Oct 2024 12:04:58 -0400 Subject: [PATCH 008/119] `tests`: add very long string for crc32 test --- tests/test_apply.rs | 12 
+++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/tests/test_apply.rs b/tests/test_apply.rs index e01563d2d..00f8613ca 100644 --- a/tests/test_apply.rs +++ b/tests/test_apply.rs @@ -2675,7 +2675,13 @@ fn apply_crc32() { let wrk = Workdir::new("apply_crc32"); wrk.create( "data.csv", - vec![svec!["name"], svec!["John"], svec!["Sue"], svec!["Hopkins"]], + vec![ + svec!["name"], + svec!["John"], + svec!["Sue"], + svec!["Hopkins"], + svec!["TheQuickBrownFoxJumpedOverTheLazyDogByTheZigzagQuarrySite 1234567890 &^#@09"], + ], ); let mut cmd = wrk.command("apply"); cmd.arg("operations") @@ -2691,6 +2697,10 @@ fn apply_crc32() { svec!["John", "2437433000"], svec!["Sue", "4264251807"], svec!["Hopkins", "1940610850"], + svec![ + "TheQuickBrownFoxJumpedOverTheLazyDogByTheZigzagQuarrySite 1234567890 &^#@09", + "4056627688" + ], ]; assert_eq!(got, expected); From 147a3750f0fcc2440cd5e60fbbc2066b4275a8fc Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Fri, 11 Oct 2024 12:22:30 -0400 Subject: [PATCH 009/119] `deps`: use our patched, modernized fork of crc32fast which is 30% faster 2021 edition; MSRV 1.81; clippy lint suggestions applied; refactored several functions for performance --- Cargo.lock | 3 +-- Cargo.toml | 3 +++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 8b2a2a96b..cb2e53f1c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1624,8 +1624,7 @@ checksum = "19d374276b40fb8bbdee95aef7c7fa6b5316ec764510eb64b8dd0e2ed0d7e7f5" [[package]] name = "crc32fast" version = "1.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a97769d94ddab943e4510d138150169a2758b5ef3eb191a9ee688de3e23ef7b3" +source = "git+https://github.com/jqnatividad/rust-crc32fast?branch=modernize#895e18662e8416f154b87870f533fe3586f3391b" dependencies = [ "cfg-if", ] diff --git a/Cargo.toml b/Cargo.toml index e5a3f4cbe..67e44c2ca 100644 --- a/Cargo.toml +++ 
b/Cargo.toml @@ -280,6 +280,9 @@ csv-index = { git = "https://github.com/jqnatividad/rust-csv", branch = "qsv-opt # use our csvlens fork with latest dependencies, including arrow 53 upstream, with unreleased lexical-core fix csvlens = { git = "https://github.com/jqnatividad/csvlens", branch = "dependency-upgrades-lexical-core_fix" } +# modernized fork of crc32fast, 2021 edition, MSRV 1.81, select clippy lint suggestions applied +crc32fast = { git = "https://github.com/jqnatividad/rust-crc32fast", branch = "modernize" } + # needed as dynfmt doesn't work in release mode without this # see https://github.com/jan-auer/dynfmt/pull/9 dynfmt = { git = "https://github.com/jqnatividad/dynfmt", branch = "2021-clippy_ptr_as_ptr-bumpdeps" } From c34a8cf2489939f337bca1d1512c324bf8cef54d Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Fri, 11 Oct 2024 12:32:04 -0400 Subject: [PATCH 010/119] `tests`: allow qsvlite test manual dispatch [skip ci] --- .github/workflows/rust-qsvlite.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/rust-qsvlite.yml b/.github/workflows/rust-qsvlite.yml index aee733007..72b31c9cd 100644 --- a/.github/workflows/rust-qsvlite.yml +++ b/.github/workflows/rust-qsvlite.yml @@ -5,6 +5,7 @@ on: branches: [ master ] pull_request: branches: [ master ] + workflow_dispatch: concurrency: group: ci-qsvlite-tests-${{ github.ref }}-1 From 42fb211dfec32af9a4b20b8d9b196f744b33ef2a Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Fri, 11 Oct 2024 12:59:00 -0400 Subject: [PATCH 011/119] `deps`: update crc32fast with fixed patch; scc and sdd --- Cargo.lock | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index cb2e53f1c..c0d6b615f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1624,7 +1624,7 @@ checksum = "19d374276b40fb8bbdee95aef7c7fa6b5316ec764510eb64b8dd0e2ed0d7e7f5" [[package]] name = "crc32fast" 
version = "1.4.2" -source = "git+https://github.com/jqnatividad/rust-crc32fast?branch=modernize#895e18662e8416f154b87870f533fe3586f3391b" +source = "git+https://github.com/jqnatividad/rust-crc32fast?branch=modernize#2ff4d6b052ef3a3fd4525272fd5c598cbd417d9a" dependencies = [ "cfg-if", ] @@ -6279,9 +6279,9 @@ dependencies = [ [[package]] name = "scc" -version = "2.2.0" +version = "2.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "836f1e0f4963ef5288b539b643b35e043e76a32d0f4e47e67febf69576527f50" +checksum = "553f8299af7450cda9a52d3a370199904e7a46b5ffd1bef187c4a6af3bb6db69" dependencies = [ "sdd", ] @@ -6318,9 +6318,9 @@ checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" [[package]] name = "sdd" -version = "3.0.3" +version = "3.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "60a7b59a5d9b0099720b417b6325d91a52cbf5b3dcb5041d864be53eefa58abc" +checksum = "49c1eeaf4b6a87c7479688c6d52b9f1153cedd3c489300564f932b065c6eab95" [[package]] name = "security-framework" From 6e9bb55e0a84d7298c59e22c17cf793a8519e63f Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Fri, 11 Oct 2024 18:44:05 -0400 Subject: [PATCH 012/119] standardize sponsor message; remove unneeded import of crossbeam_channel --- src/config.rs | 4 ++++ src/main.rs | 32 ++++++++++---------------------- src/maindp.rs | 38 +++++++++++++++----------------------- src/mainlite.rs | 34 +++++++++++++--------------------- 4 files changed, 42 insertions(+), 66 deletions(-) diff --git a/src/config.rs b/src/config.rs index dc37c1c5c..41a9d01a1 100644 --- a/src/config.rs +++ b/src/config.rs @@ -29,6 +29,10 @@ const NO_INDEX_WARNING_FILESIZE: u64 = 100_000_000; // 100MB // so we don't have to keep checking if the index has been created static AUTO_INDEXED: AtomicBool = AtomicBool::new(false); +pub static SPONSOR_MESSAGE: &str = r#"sponsored by datHere - Data Infrastructure Engineering 
(https://qsv.datHere.com) +Need a UI & more advanced data-wrangling? Upgrade to qsv pro (https://qsvpro.dathere.com) +"#; + #[derive(Clone, Copy, Debug, PartialEq, Eq)] pub struct Delimiter(pub u8); diff --git a/src/main.rs b/src/main.rs index 9eafe4bb1..25563bfad 100644 --- a/src/main.rs +++ b/src/main.rs @@ -33,7 +33,6 @@ ) )] -extern crate crossbeam_channel as channel; use std::{env, io, time::Instant}; extern crate qsv_docopt as docopt; @@ -41,7 +40,10 @@ use docopt::Docopt; use rand::Rng; use serde::Deserialize; -use crate::clitypes::{CliError, CliResult, QsvExitCode, CURRENT_COMMAND}; +use crate::{ + clitypes::{CliError, CliResult, QsvExitCode, CURRENT_COMMAND}, + config::SPONSOR_MESSAGE, +}; #[cfg(feature = "mimalloc")] #[global_allocator] @@ -72,10 +74,7 @@ Options: -h, --help Display this message -h Display the command help message -v, --version Print version info, mem allocator, features installed, - max_jobs, num_cpus, build info then exit - -sponsored by datHere - Data Infrastructure Engineering (https://qsv.datHere.com) -"#; + max_jobs, num_cpus, build info then exit"#; #[derive(Deserialize)] struct Args { @@ -214,7 +213,7 @@ fn main() -> QsvExitCode { }, }; - let args: Args = Docopt::new(USAGE) + let args: Args = Docopt::new(format!("{USAGE}\n\n{SPONSOR_MESSAGE}")) .and_then(|d| { d.options_first(true) .version(Some(util::version())) @@ -227,13 +226,7 @@ fn main() -> QsvExitCode { } if args.flag_list { - wout!("Installed commands ({num_commands}):"); - wout!( - r#"{enabled_commands} - -sponsored by datHere - Data Infrastructure Engineering (https://qsv.datHere.com) -"# - ); + wout!("Installed commands ({num_commands}):\n{enabled_commands}\n\n{SPONSOR_MESSAGE}"); util::log_end(qsv_args, now); return QsvExitCode::Good; } else if args.flag_envlist { @@ -252,13 +245,8 @@ sponsored by datHere - Data Infrastructure Engineering (https://qsv.datHere.com) match args.arg_command { None => { werr!( - r#"qsv is a suite of CSV command line utilities. 
- -Please choose one of the following {num_commands} commands: -{enabled_commands} - -sponsored by datHere - Data Infrastructure Engineering (https://qsv.datHere.com) -"# + "qsv is a suite of CSV command line utilities.\n\nPlease choose one of the \ + following {num_commands} commands:\n{enabled_commands}\n\n{SPONSOR_MESSAGE}" ); // if no command is specified, auto-check for updates 10% of the time @@ -459,7 +447,7 @@ impl Command { Command::Geocode => cmd::geocode::run(argv), Command::Headers => cmd::headers::run(argv), Command::Help => { - wout!("{USAGE}"); + wout!("{USAGE}\n\n{SPONSOR_MESSAGE}"); util::qsv_check_for_update(true, false)?; Ok(()) }, diff --git a/src/maindp.rs b/src/maindp.rs index 91f3f5ccb..dbdb86a68 100644 --- a/src/maindp.rs +++ b/src/maindp.rs @@ -32,14 +32,16 @@ clippy::option_if_let_else, ) )] -extern crate crossbeam_channel as channel; use std::{env, io, time::Instant}; extern crate qsv_docopt as docopt; use docopt::Docopt; use serde::Deserialize; -use crate::clitypes::{CliError, CliResult, QsvExitCode, CURRENT_COMMAND}; +use crate::{ + clitypes::{CliError, CliResult, QsvExitCode, CURRENT_COMMAND}, + config::SPONSOR_MESSAGE, +}; #[cfg(feature = "mimalloc")] #[global_allocator] @@ -49,9 +51,7 @@ static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc; #[global_allocator] static GLOBAL: jemallocator::Jemalloc = jemallocator::Jemalloc; -macro_rules! command_list { - () => { - " +static COMMAND_LIST: &str = r#" applydp Apply series of transformations to a column count Count records datefmt Format date/datetime strings @@ -86,12 +86,8 @@ macro_rules! command_list { stats Infer data types and compute summary statistics validate Validate CSV data for RFC4180-compliance or with JSON Schema - NOTE: qsvdp ignores the --progressbar option for all commands. 
+ NOTE: qsvdp ignores the --progressbar option for all commands."#; -sponsored by datHere - Data Infrastructure Engineering (https://qsv.datHere.com) -" - }; -} mod clitypes; mod cmd; mod config; @@ -113,10 +109,7 @@ Options: -h, --help Display this message -h Display the command help message -v, --version Print version info, mem allocator, features installed, - max_jobs, num_cpus, build info then exit - -sponsored by datHere - Data Infrastructure Engineering (https://qsv.datHere.com) -"#; + max_jobs, num_cpus, build info then exit"#; #[derive(Deserialize)] struct Args { arg_command: Option, @@ -136,7 +129,7 @@ fn main() -> QsvExitCode { }, }; - let args: Args = Docopt::new(USAGE) + let args: Args = Docopt::new(format!("{USAGE}\n\n{SPONSOR_MESSAGE}")) .and_then(|d| { d.options_first(true) .version(Some(util::version())) @@ -149,7 +142,7 @@ fn main() -> QsvExitCode { } if args.flag_list { - wout!(concat!("Installed commands:", command_list!())); + wout!("Installed commands:{}\n\n{}", COMMAND_LIST, SPONSOR_MESSAGE); util::log_end(qsv_args, now); return QsvExitCode::Good; } else if args.flag_envlist { @@ -167,12 +160,11 @@ fn main() -> QsvExitCode { } match args.arg_command { None => { - werr!(concat!( - "qsvdp is a suite of CSV command line utilities optimized for Datapusher+. 
- -Please choose one of the following commands:", - command_list!() - )); + werr!( + "qsvdp is a suite of CSV command line utilities optimized for \ + Datapusher+.\n\nPlease choose one of the following \ + commands:\n{COMMAND_LIST}\n\n{SPONSOR_MESSAGE}", + ); util::log_end(qsv_args, now); QsvExitCode::Good @@ -308,7 +300,7 @@ impl Command { Command::Frequency => cmd::frequency::run(argv), Command::Headers => cmd::headers::run(argv), Command::Help => { - wout!("{USAGE}"); + wout!("{USAGE}\n\n{SPONSOR_MESSAGE}"); util::qsv_check_for_update(true, false)?; Ok(()) }, diff --git a/src/mainlite.rs b/src/mainlite.rs index 1445101a1..50682e467 100644 --- a/src/mainlite.rs +++ b/src/mainlite.rs @@ -1,4 +1,3 @@ -extern crate crossbeam_channel as channel; use std::{env, io, time::Instant}; extern crate qsv_docopt as docopt; @@ -6,7 +5,10 @@ use docopt::Docopt; use rand::Rng; use serde::Deserialize; -use crate::clitypes::{CliError, CliResult, QsvExitCode, CURRENT_COMMAND}; +use crate::{ + clitypes::{CliError, CliResult, QsvExitCode, CURRENT_COMMAND}, + config::SPONSOR_MESSAGE, +}; #[cfg(feature = "mimalloc")] #[global_allocator] @@ -16,9 +18,7 @@ static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc; #[global_allocator] static GLOBAL: jemallocator::Jemalloc = jemallocator::Jemalloc; -macro_rules! command_list { - () => { - " +static COMMAND_LIST: &str = r#" behead Drop header from CSV file cat Concatenate by row or column clipboard Provide input from clipboard or output to clipboard @@ -68,12 +68,8 @@ macro_rules! 
command_list { table Align CSV data into columns tojsonl Convert CSV to newline-delimited JSON transpose Transpose rows/columns of CSV data - validate Validate CSV data for RFC4180-compliance or with JSON Schema + validate Validate CSV data for RFC4180-compliance or with JSON Schema"#; -sponsored by datHere - Data Infrastructure Engineering (https://qsv.datHere.com) -" - }; -} mod clitypes; mod cmd; mod config; @@ -96,8 +92,6 @@ Options: -h Display the command help message -v, --version Print version info, mem allocator, features installed, max_jobs, num_cpus, build info then exit - -sponsored by datHere - Data Infrastructure Engineering (https://qsv.datHere.com) "#; #[derive(Deserialize)] @@ -119,7 +113,7 @@ fn main() -> QsvExitCode { }, }; - let args: Args = Docopt::new(USAGE) + let args: Args = Docopt::new(format!("{USAGE}\n\n{SPONSOR_MESSAGE}")) .and_then(|d| { d.options_first(true) .version(Some(util::version())) @@ -132,7 +126,7 @@ fn main() -> QsvExitCode { } if args.flag_list { - wout!(concat!("Installed commands:", command_list!())); + wout!("Installed commands:{}\n\n{}", COMMAND_LIST, SPONSOR_MESSAGE); util::log_end(qsv_args, now); return QsvExitCode::Good; } else if args.flag_envlist { @@ -150,12 +144,10 @@ fn main() -> QsvExitCode { } match args.arg_command { None => { - werr!(concat!( - "qsvlite is a suite of CSV command line utilities. 
- -Please choose one of the following commands:", - command_list!() - )); + werr!( + "qsvlite is a suite of CSV command line utilities.\n\nPlease choose one of the \ + following commands:\n{COMMAND_LIST}\n\n{SPONSOR_MESSAGE}", + ); // if no command is specified, auto-check for updates 10% of the time let mut rng = rand::thread_rng(); //DevSkim: ignore DS148264 @@ -320,7 +312,7 @@ impl Command { Command::Frequency => cmd::frequency::run(argv), Command::Headers => cmd::headers::run(argv), Command::Help => { - wout!("{USAGE}"); + wout!("{USAGE}\n\n{SPONSOR_MESSAGE}"); util::qsv_check_for_update(true, false)?; Ok(()) }, From c307f8d81378e17854c0c967f25767f9396570f5 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Fri, 11 Oct 2024 18:44:51 -0400 Subject: [PATCH 013/119] `stats`, `frequency`: import crossbeam_channel only in these two modules --- src/cmd/frequency.rs | 3 ++- src/cmd/stats.rs | 5 +++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/cmd/frequency.rs b/src/cmd/frequency.rs index f68907c13..33c8826bc 100644 --- a/src/cmd/frequency.rs +++ b/src/cmd/frequency.rs @@ -121,6 +121,7 @@ Common options: use std::{fs, io, sync::OnceLock}; +use crossbeam_channel; use indicatif::HumanCount; use rust_decimal::prelude::*; use serde::Deserialize; @@ -391,7 +392,7 @@ impl Args { let nchunks = util::num_of_chunks(idx_count, chunk_size); let pool = ThreadPool::new(njobs); - let (send, recv) = channel::bounded(0); + let (send, recv) = crossbeam_channel::bounded(0); for i in 0..nchunks { let (send, args, sel) = (send.clone(), self.clone(), sel.clone()); pool.execute(move || { diff --git a/src/cmd/stats.rs b/src/cmd/stats.rs index ef5addc5d..4896da189 100644 --- a/src/cmd/stats.rs +++ b/src/cmd/stats.rs @@ -246,6 +246,7 @@ use std::{ sync::OnceLock, }; +use crossbeam_channel; use itertools::Itertools; use qsv_dateparser::parse_with_preference; use serde::{Deserialize, Serialize}; @@ -933,7 +934,7 @@ impl Args 
{ let nchunks = util::num_of_chunks(idx_count as usize, chunk_size); let pool = ThreadPool::new(util::njobs(self.flag_jobs)); - let (send, recv) = channel::bounded(0); + let (send, recv) = crossbeam_channel::bounded(0); for i in 0..nchunks { let (send, args, sel) = (send.clone(), self.clone(), sel.clone()); pool.execute(move || { @@ -961,7 +962,7 @@ impl Args { let pool = ThreadPool::new(util::njobs(self.flag_jobs)); let mut results = Vec::with_capacity(stats.len()); for mut stat in stats { - let (send, recv) = channel::bounded(0); + let (send, recv) = crossbeam_channel::bounded(0); results.push(recv); pool.execute(move || { // safety: this will only return an Error if the channel has been disconnected From eea0f64ce63d34e591d082d95ab60eeff95802df Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Fri, 11 Oct 2024 19:01:07 -0400 Subject: [PATCH 014/119] use regular crc32fast crate; self-hosted runner is getting confused with which crate to use --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 67e44c2ca..a2c452984 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -281,7 +281,7 @@ csv-index = { git = "https://github.com/jqnatividad/rust-csv", branch = "qsv-opt csvlens = { git = "https://github.com/jqnatividad/csvlens", branch = "dependency-upgrades-lexical-core_fix" } # modernized fork of crc32fast, 2021 edition, MSRV 1.81, select clippy lint suggestions applied -crc32fast = { git = "https://github.com/jqnatividad/rust-crc32fast", branch = "modernize" } +# crc32fast = { git = "https://github.com/jqnatividad/rust-crc32fast", branch = "modernize" } # needed as dynfmt doesn't work in release mode without this # see https://github.com/jan-auer/dynfmt/pull/9 From e93c475cda3b799b2a26065cfc58aceb4ef8026e Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Fri, 11 Oct 2024 19:01:44 -0400 Subject: [PATCH 015/119] `deps`: update 
lock file --- Cargo.lock | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c0d6b615f..0527421ca 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1624,7 +1624,8 @@ checksum = "19d374276b40fb8bbdee95aef7c7fa6b5316ec764510eb64b8dd0e2ed0d7e7f5" [[package]] name = "crc32fast" version = "1.4.2" -source = "git+https://github.com/jqnatividad/rust-crc32fast?branch=modernize#2ff4d6b052ef3a3fd4525272fd5c598cbd417d9a" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a97769d94ddab943e4510d138150169a2758b5ef3eb191a9ee688de3e23ef7b3" dependencies = [ "cfg-if", ] @@ -4527,9 +4528,9 @@ checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" [[package]] name = "pathdiff" -version = "0.2.1" +version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8835116a5c179084a830efb3adc117ab007512b535bc1a21c991d3b32a6b44dd" +checksum = "d61c5ce1153ab5b689d0c074c4e7fc613e942dfb7dd9eea5ab202d2ad91fe361" [[package]] name = "pbkdf2" From 380f11a3569d73c44ff108e7ae5efef396186967 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 12 Oct 2024 09:53:46 +0000 Subject: [PATCH 016/119] build(deps): bump pyo3 from 0.22.3 to 0.22.4 Bumps [pyo3](https://github.com/pyo3/pyo3) from 0.22.3 to 0.22.4. - [Release notes](https://github.com/pyo3/pyo3/releases) - [Changelog](https://github.com/PyO3/pyo3/blob/main/CHANGELOG.md) - [Commits](https://github.com/pyo3/pyo3/commits) --- updated-dependencies: - dependency-name: pyo3 dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] --- Cargo.lock | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0527421ca..876aa0a98 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5372,9 +5372,9 @@ dependencies = [ [[package]] name = "pyo3" -version = "0.22.3" +version = "0.22.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "15ee168e30649f7f234c3d49ef5a7a6cbf5134289bc46c29ff3155fa3221c225" +checksum = "00e89ce2565d6044ca31a3eb79a334c3a79a841120a98f64eea9f579564cb691" dependencies = [ "cfg-if", "indoc", @@ -5390,9 +5390,9 @@ dependencies = [ [[package]] name = "pyo3-build-config" -version = "0.22.3" +version = "0.22.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e61cef80755fe9e46bb8a0b8f20752ca7676dcc07a5277d8b7768c6172e529b3" +checksum = "d8afbaf3abd7325e08f35ffb8deb5892046fcb2608b703db6a583a5ba4cea01e" dependencies = [ "once_cell", "target-lexicon", @@ -5400,9 +5400,9 @@ dependencies = [ [[package]] name = "pyo3-ffi" -version = "0.22.3" +version = "0.22.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67ce096073ec5405f5ee2b8b31f03a68e02aa10d5d4f565eca04acc41931fa1c" +checksum = "ec15a5ba277339d04763f4c23d85987a5b08cbb494860be141e6a10a8eb88022" dependencies = [ "libc", "pyo3-build-config", @@ -5410,9 +5410,9 @@ dependencies = [ [[package]] name = "pyo3-macros" -version = "0.22.3" +version = "0.22.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2440c6d12bc8f3ae39f1e775266fa5122fd0c8891ce7520fa6048e683ad3de28" +checksum = "15e0f01b5364bcfbb686a52fc4181d412b708a68ed20c330db9fc8d2c2bf5a43" dependencies = [ "proc-macro2", "pyo3-macros-backend", @@ -5422,9 +5422,9 @@ dependencies = [ [[package]] name = "pyo3-macros-backend" -version = "0.22.3" +version = "0.22.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"1be962f0e06da8f8465729ea2cb71a416d2257dff56cbe40a70d3e62a93ae5d1" +checksum = "a09b550200e1e5ed9176976d0060cbc2ea82dc8515da07885e7b8153a85caacb" dependencies = [ "heck 0.5.0", "proc-macro2", From e9ccec71b6a170adb88e7333f6dbeffa555d08fa Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sat, 12 Oct 2024 09:51:34 -0400 Subject: [PATCH 017/119] `docs`: document config helper functions [skip ci] --- src/config.rs | 103 ++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 100 insertions(+), 3 deletions(-) diff --git a/src/config.rs b/src/config.rs index 41a9d01a1..e68d0917f 100644 --- a/src/config.rs +++ b/src/config.rs @@ -105,6 +105,38 @@ pub trait SeekRead: io::Seek + io::Read {} impl SeekRead for T {} impl Config { + /// Creates a new `Config` instance with default settings and optional file path. + /// + /// # Arguments + /// + /// * `path` - An optional reference to a `String` representing the file path. + /// + /// # Returns + /// + /// A new `Config` instance. + /// + /// # Details + /// + /// This function initializes a `Config` with the following behavior: + /// - Uses env var `QSV_DEFAULT_DELIMITER` for default delimiter, or ',' if not set + /// - Determines delimiter and Snappy compression based on file extension. + /// - Supports sniffing delimiter and preamble rows if `QSV_SNIFF_DELIMITER` or + /// `QSV_SNIFF_PREAMBLE` is set. + /// - Sets comment character from `QSV_COMMENT_CHAR` environment variable. + /// - Sets headers behavior based on `QSV_NO_HEADERS` environment variable. + /// - Configures various other settings from environment variables. + /// + /// # Environment Variables + /// + /// - `QSV_DEFAULT_DELIMITER`: Sets the default delimiter. + /// - `QSV_SNIFF_DELIMITER` or `QSV_SNIFF_PREAMBLE`: Enables sniffing of delimiter and preamble + /// rows. + /// - `QSV_COMMENT_CHAR`: Sets the comment character. + /// - `QSV_NO_HEADERS`: Determines if the file has headers. 
+ /// - `QSV_AUTOINDEX_SIZE`: Sets the auto-index size. + /// - `QSV_PREFER_DMY`: Sets date format preference. + /// - `QSV_RDR_BUFFER_CAPACITY`: Sets read buffer capacity. + /// - `QSV_WTR_BUFFER_CAPACITY`: Sets write buffer capacity. pub fn new(path: Option<&String>) -> Config { let default_delim = match env::var("QSV_DEFAULT_DELIMITER") { Ok(delim) => Delimiter::decode_delimiter(&delim).unwrap().as_byte(), @@ -305,6 +337,20 @@ impl Config { } #[inline] + /// Returns a `Selection` based on the config's `select_columns` & the first record of the CSV. + /// + /// # Arguments + /// + /// * `first_record` - A reference to the first `ByteRecord` of the CSV. + /// + /// # Returns + /// + /// * `Result` - A `Selection` if successful, otherwise, an error msg + /// + /// # Errors + /// + /// This function will return an error if: + /// * The `Config` has no `SelectColumns` (i.e., `Config::select` was not called). pub fn selection(&self, first_record: &csv::ByteRecord) -> Result { match self.select_columns { None => fail!("Config has no 'SelectColumns'. Did you call Config::select?"), @@ -312,6 +358,20 @@ impl Config { } } + /// Writes the headers from a CSV reader to a CSV writer. + /// + /// This function reads the headers from the given CSV reader and writes them to the CSV writer, + /// but only if the `no_headers` flag is not set. If the headers are empty, nothing is written. + /// + /// # Arguments + /// + /// * `r` - A mutable reference to a CSV reader. + /// * `w` - A mutable reference to a CSV writer. + /// + /// # Returns + /// + /// Returns a `csv::Result<()>` which is `Ok(())` if the operation was successful, + /// or an error if there was a problem reading or writing. pub fn write_headers( &self, r: &mut csv::Reader, @@ -358,11 +418,26 @@ impl Config { }) } + /// Automatically creates an index file for the CSV file. + /// + /// This function attempts to create an index file for the CSV file specified in `self.path`. 
+ /// It's designed to fail silently if any step of the process encounters an error, as it's + /// intended to be a convenience function. + /// + /// # Behavior + /// + /// - If the file is Snappy-compressed, the function returns immediately w/o creating an index. + /// - If `self.path` is `None`, the function returns without action. + /// - The function creates an index file using `util::idx_path()` to determine index file path. + /// - It uses `csv_index::RandomAccessSimple::create()` to generate the index. + /// - If index creation is successful, it sets the `AUTO_INDEXED` atomic flag to `true`. + /// + /// # Errors + /// + /// While this function doesn't return any errors, it logs debug messages for both successful + /// and failed index creation attempts. fn autoindex_file(&self) { - // autoindex_file should never panic. It should silently fail as its a "convenience fn" - // that's why we have a lot of let-else returns, in lieu of unwraps if self.snappy { - // cannot index snappy compressed files return; } @@ -557,6 +632,28 @@ impl Config { } } +/// Determines the delimiter and compression status based on the file extension. +/// +/// # Arguments +/// +/// * `path` - A reference to the `Path` of the file. +/// * `default_delim` - The default delimiter to use if not determined by extension. +/// +/// # Returns +/// +/// A tuple containing: +/// * `String` - The lowercase file extension. +/// * `u8` - The determined delimiter. +/// * `bool` - Whether the file is Snappy-compressed. +/// +/// # Details +/// +/// This function examines the file extension to determine: +/// 1. The appropriate delimiter (tab for .tsv/.tab, semicolon for .ssv, comma for .csv). +/// 2. Whether the file is Snappy-compressed (indicated by a .sz extension). +/// 3. For Snappy-compressed files, it checks the extension before .sz to determine the delimiter. +/// +/// If the file extension doesn't match known types, it returns the default delimiter. 
pub fn get_delim_by_extension(path: &Path, default_delim: u8) -> (String, u8, bool) { let mut snappy = false; let file_extension = path From 5bb5eee14186073ac1fd545f2d27f2561d48536d Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sat, 12 Oct 2024 19:19:54 -0400 Subject: [PATCH 018/119] `extdedup`: now support two modes - LINE mode and CSV mode --- src/cmd/extdedup.rs | 204 ++++++++++++++++++++++++++++++++++++-------- 1 file changed, 167 insertions(+), 37 deletions(-) diff --git a/src/cmd/extdedup.rs b/src/cmd/extdedup.rs index 2e9ae300f..f96f872ba 100644 --- a/src/cmd/extdedup.rs +++ b/src/cmd/extdedup.rs @@ -7,8 +7,14 @@ to sort the CSV first before deduping it. This allows it to run in constant memory and the output will retain the input sort order. -Also, this command is not specific to CSV data, it deduplicates any text file on a -line-by-line basis. +This command has TWO modes of operation. + + * CSV MODE + when --select is set, it dedupes based on the given column/s. See `qsv select --help` + for select syntax details. + * LINE MODE + when --select is NOT set, it deduplicates any input text file (not just CSVs) on a + line-by-line basis. A duplicate count will be sent to . @@ -17,6 +23,10 @@ Usage: qsv extdedup --help extdedup options: + -s, --select Select a subset of columns to dedup. + Note that the outputs will remain at the full width of the CSV. + If --select is NOT set, extdedup will work in LINE MODE, deduplicating + the input as a text file on a line-by-line basis. --no-output Do not write deduplicated output to . Use this if you only want to know the duplicate count. -D, --dupes-output Write duplicates to . @@ -25,9 +35,20 @@ extdedup options: duplicate separated by a tab from the duplicate line itself. -H, --human-readable Comma separate duplicate count. --memory-limit The maximum amount of memory to buffer the on-disk hash table. - This is a percentage of total memory. 
[default: 10] + If 50 or less, this is a percentage of total memory. + If more than 50, this is the memory in MB to allocate, capped + at 90 percent of total memory. + [default: 10] Common options: + CSV MODE ONLY: + -n, --no-headers When set, the first row will not be interpreted + as headers. That is, it will be deduped with the rest + of the rows. Otherwise, the first row will always + appear as the header row in the output. + -d, --delimiter The field delimiter for reading CSV data. + Must be a single character. (default: ,) + -h, --help Display this message -Q, --quiet Do not print duplicate count to stderr. "#; @@ -41,17 +62,25 @@ use indicatif::HumanCount; use serde::Deserialize; use sysinfo::System; -// use sysinfo::System::sysinfo; -use crate::{config, odhtcache, util, CliResult}; +use crate::{ + config, + config::{Config, Delimiter}, + odhtcache, + select::SelectColumns, + util, CliResult, +}; #[derive(Deserialize)] struct Args { arg_input: Option, + flag_select: Option, arg_output: Option, + flag_no_headers: bool, + flag_delimiter: Option, flag_no_output: bool, flag_dupes_output: Option, flag_human_readable: bool, - flag_memory_limit: Option, + flag_memory_limit: Option, flag_quiet: bool, } @@ -60,19 +89,101 @@ const MEMORY_LIMITED_BUFFER: u64 = 100 * 1_000_000; // 100 MB pub fn run(argv: &[&str]) -> CliResult<()> { let args: Args = util::get_args(USAGE, argv)?; - // memory buffer to use for on-disk hash table, - // if we can detect the total memory, use 10% of it by default - // and up to --memory-limit (capped at 50%), - // otherwise, if we cannot detect the free memory use a default of 100 MB - let mem_limited_buffer = if sysinfo::IS_SUPPORTED_SYSTEM { - let mut sys = System::new(); - sys.refresh_memory(); - (sys.total_memory() * 1000) / u8::min(args.flag_memory_limit.unwrap_or(10), 50) as u64 + // Set the memory buffer size for the on-disk hash table based on --memory-limit + // and system capabilities. 
+ let mem_limited_buffer_bytes = calculate_memory_limit(args.flag_memory_limit); + log::info!("{mem_limited_buffer_bytes} bytes used for memory buffer for on-disk hash table..."); + + let quiet = args.flag_quiet; + let human_readable = args.flag_human_readable; + + let dupes_count = if args.flag_select.is_some() { + dedup_csv(args, mem_limited_buffer_bytes)? } else { - MEMORY_LIMITED_BUFFER + dedup_lines(args, mem_limited_buffer_bytes)? }; - log::info!("{mem_limited_buffer} bytes used for memory buffer for on-disk hash table..."); + if quiet { + return Ok(()); + } + + eprintln!( + "{}", + if human_readable { + HumanCount(dupes_count).to_string() + } else { + dupes_count.to_string() + } + ); + + Ok(()) +} + +fn dedup_csv(args: Args, mem_limited_buffer: u64) -> Result { + let rconfig = Config::new(args.arg_input.as_ref()) + .delimiter(args.flag_delimiter) + .no_headers(args.flag_no_headers) + .select(args.flag_select.unwrap()); + + let mut rdr = rconfig.reader()?; + let mut wtr = Config::new(args.arg_output.as_ref()).writer()?; + let dupes_output = args.flag_dupes_output.is_some(); + let mut dupewtr = Config::new(args.flag_dupes_output.as_ref()).writer()?; + + let headers = rdr.byte_headers()?.clone(); + if dupes_output { + let mut dupe_headers = csv::ByteRecord::new(); + dupe_headers.push_field(b"dupe_rowno"); + dupe_headers.extend(headers.iter()); + dupewtr.write_byte_record(&dupe_headers)?; + } + + let mut dedup_cache = odhtcache::ExtDedupCache::new(mem_limited_buffer); + let mut dupes_count = 0_u64; + let sel = rconfig.selection(&headers)?; + + rconfig.write_headers(&mut rdr, &mut wtr)?; + + // Pre-allocate and reuse buffers + let mut key = String::with_capacity(20); + let mut utf8_string = String::with_capacity(20); + let mut dupe_row = csv::ByteRecord::new(); + let mut curr_row = csv::ByteRecord::new(); + + for (row_idx, row) in rdr.byte_records().enumerate() { + curr_row.clone_from(&row?); + key.clear(); + for field in sel.select(&curr_row) { + if let 
Ok(s_utf8) = simdutf8::basic::from_utf8(field) { + key.push_str(s_utf8); + } else { + utf8_string.clear(); + utf8_string.push_str(&String::from_utf8_lossy(field)); + key.push_str(&utf8_string); + } + } + + if dedup_cache.contains(&key) { + dupes_count += 1; + if dupes_output { + dupe_row.clear(); + dupe_row.push_field((row_idx + 1).to_string().as_bytes()); + dupe_row.extend(curr_row.iter()); + dupewtr.write_byte_record(&dupe_row)?; + } + } else { + dedup_cache.insert(&key); + wtr.write_byte_record(&curr_row)?; + } + } + + dupewtr.flush()?; + wtr.flush()?; + + Ok(dupes_count) +} + +fn dedup_lines(args: Args, mem_limited_buffer: u64) -> Result { let input_reader: Box = match &args.arg_input { Some(input_path) => { if input_path.to_lowercase().ends_with(".sz") { @@ -88,7 +199,6 @@ pub fn run(argv: &[&str]) -> CliResult<()> { }, None => Box::new(io::BufReader::new(stdin().lock())), }; - let mut output_writer: Box = match &args.arg_output { Some(output_path) => Box::new(io::BufWriter::with_capacity( config::DEFAULT_WTR_BUFFER_CAPACITY, @@ -99,9 +209,7 @@ pub fn run(argv: &[&str]) -> CliResult<()> { stdout().lock(), )), }; - let mut write_dupes = false; - #[cfg(target_family = "unix")] let mut dupes_writer = if let Some(dupes_output) = args.flag_dupes_output { write_dupes = true; @@ -115,7 +223,6 @@ pub fn run(argv: &[&str]) -> CliResult<()> { fs::File::create("/dev/null")?, ) }; - #[cfg(target_family = "windows")] let mut dupes_writer = if let Some(dupes_output) = args.flag_dupes_output { write_dupes = true; @@ -129,44 +236,67 @@ pub fn run(argv: &[&str]) -> CliResult<()> { fs::File::create("nul")?, ) }; - let mut dedup_cache = odhtcache::ExtDedupCache::new(mem_limited_buffer); - let mut dupes_count = 0_u64; - let mut line_work = String::with_capacity(100); + let mut line_work = String::with_capacity(1024); for (row_idx, line) in input_reader.lines().enumerate() { line_work.clone_from(&line?); if dedup_cache.contains(&line_work) { dupes_count += 1; if write_dupes { - 
dupes_writer.write_all(format!("{row_idx}\t{line_work}\n").as_bytes())?; + writeln!(dupes_writer, "{row_idx}\t{line_work}")?; } } else { dedup_cache.insert(&line_work); if args.flag_no_output { continue; } - output_writer.write_all(format!("{line_work}\n").as_bytes())?; + writeln!(output_writer, "{line_work}")?; } } - dupes_writer.flush()?; output_writer.flush()?; - if args.flag_quiet { - return Ok(()); + Ok(dupes_count) +} + +/// Determines the memory buffer size to use for on-disk hash table based on +/// the provided flag and the system's total memory. +/// +/// # Arguments +/// +/// * `flag_memory_limit` - An optional u64 value representing the user-specified memory limit. +/// +/// # Returns +/// +/// A u64 value representing the calculated memory limit in bytes. +/// +/// # Behavior +/// +/// - If the system is not supported, it returns a predefined `MEMORY_LIMITED_BUFFER` value. +/// - If `flag_memory_limit` is None, it returns the `MEMORY_LIMITED_BUFFER`. +/// - If `flag_memory_limit` is Some(limit): +/// - For limit <= 50, it's treated as a percentage of total system memory. +/// - For limit > 50, it's treated as megabytes, but capped at 90% of total system memory. 
+fn calculate_memory_limit(flag_memory_limit: Option) -> u64 { + if !sysinfo::IS_SUPPORTED_SYSTEM { + return MEMORY_LIMITED_BUFFER; } - eprintln!( - "{}", - if args.flag_human_readable { - HumanCount(dupes_count).to_string() - } else { - dupes_count.to_string() - } - ); + let mut sys = System::new(); + sys.refresh_memory(); + let total_memory = sys.total_memory(); - Ok(()) + #[allow(clippy::cast_precision_loss)] + match flag_memory_limit { + Some(limit) if limit <= 50 => ((total_memory as f64 * limit as f64) / 100.0) as u64, + Some(limit) => { + let limit_bytes = limit.saturating_mul(1_000_000); // Convert MB to bytes + let ninety_percent_total = (total_memory as f64 * 0.9) as u64; + std::cmp::min(limit_bytes, ninety_percent_total) + }, + None => MEMORY_LIMITED_BUFFER, + } } #[test] From f26d9dbe9df210e7644e2e65a3937dfd4688f3de Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sat, 12 Oct 2024 19:20:49 -0400 Subject: [PATCH 019/119] `tests`: add additional tests for `extdedup` csv mode --- tests/test_extdedup.rs | 102 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 99 insertions(+), 3 deletions(-) diff --git a/tests/test_extdedup.rs b/tests/test_extdedup.rs index 3ab70490b..b447ecdd1 100644 --- a/tests/test_extdedup.rs +++ b/tests/test_extdedup.rs @@ -3,8 +3,8 @@ use newline_converter::dos2unix; use crate::workdir::Workdir; #[test] -fn extdedup() { - let wrk = Workdir::new("extdedup").flexible(true); +fn extdedup_linemode() { + let wrk = Workdir::new("extdedup_linemode").flexible(true); wrk.clear_contents().unwrap(); let test_file = wrk.load_test_file("boston311-100-20dupes-random.csv"); @@ -23,7 +23,7 @@ fn extdedup() { } #[test] -fn extdedup_dupesoutput() { +fn extdedup_linemode_dupesoutput() { let wrk = Workdir::new("extdedup-dupes-output").flexible(true); wrk.clear_contents().unwrap(); @@ -54,3 +54,99 @@ fn extdedup_dupesoutput() { assert_eq!(dos2unix(&dupes_output), dos2unix(&expected_output)); } + 
+#[test] +fn extdedupe_csvmode() { + let wrk = Workdir::new("extdedup-csvmode").flexible(true); + wrk.clear_contents().unwrap(); + + let test_file = wrk.load_test_file("boston311-100-20dupes-random.csv"); + + let mut cmd = wrk.command("extdedup"); + cmd.arg(test_file) + .arg("boston311-100-extdeduped.csv") + .args(["--select", "case_enquiry_id,open_dt,target_dt"]); + wrk.output(&mut cmd); + + // load deduped output + let deduped_output: String = wrk.from_str(&wrk.path("boston311-100-extdeduped.csv")); + + let expected_csv = wrk.load_test_resource("boston311-100-deduped.csv"); + wrk.create_from_string("boston311-100-deduped.csv", &expected_csv); + + assert_eq!(dos2unix(&deduped_output), dos2unix(&expected_csv)); + + // Check that the correct number of rows were deduplicated + let output = wrk.output(&mut cmd); + + // 20 duplicates should be removed + assert!(String::from_utf8_lossy(&output.stderr).contains("20\n")); +} + +#[test] +fn extdedupe_csvmode_dupesoutput() { + let wrk = Workdir::new("extdedup-csvmode-dupesoutput").flexible(true); + wrk.clear_contents().unwrap(); + + let test_file = wrk.load_test_file("boston311-100-20dupes-random.csv"); + + let mut cmd = wrk.command("extdedup"); + cmd.arg(test_file) + .arg("boston311-100-extdeduped.csv") + .args([ + "--select", + "case_enquiry_id,open_dt,target_dt", + "--dupes-output", + "boston311-100-extdededuped-dupeoutput.csv", + ]); + wrk.output(&mut cmd); + + // load deduped output + let deduped_output: String = wrk.from_str(&wrk.path("boston311-100-extdeduped.csv")); + + let expected_csv = wrk.load_test_resource("boston311-100-deduped.csv"); + wrk.create_from_string("boston311-100-deduped.csv", &expected_csv); + + assert_eq!(dos2unix(&deduped_output), dos2unix(&expected_csv)); + + // load dupe-output txt + let dupes_output: String = wrk.from_str(&wrk.path("boston311-100-extdededuped-dupeoutput.csv")); + + let expected_output = wrk.load_test_resource("boston311-extdedup-dupeoutput.csv"); + 
wrk.create_from_string("boston311-extdedup-dupeoutput.csv", &expected_output); + + assert_eq!(dos2unix(&dupes_output), dos2unix(&expected_output)); + + // Check that the correct number of rows were deduplicated + let output = wrk.output(&mut cmd); + // 20 duplicates should be removed + assert!(String::from_utf8_lossy(&output.stderr).contains("20\n")); +} + +#[test] +fn extdedupe_csvmode_neighborhood() { + let wrk = Workdir::new("extdedup-csvmode-neighborhood").flexible(true); + wrk.clear_contents().unwrap(); + + let test_file = wrk.load_test_file("boston311-100-20dupes-random.csv"); + + let mut cmd = wrk.command("extdedup"); + cmd.arg(test_file) + .arg("boston311-100-extdeduped.csv") + .args(["--select", "neighborhood"]); + wrk.output(&mut cmd); + + // load deduped output + let deduped_output: String = wrk.from_str(&wrk.path("boston311-100-extdeduped.csv")); + + let expected_csv = wrk.load_test_resource("boston311-extdedup-neighborhood.csv"); + wrk.create_from_string("boston311-extdedup-neighborhood.csv", &expected_csv); + + assert_eq!(dos2unix(&deduped_output), dos2unix(&expected_csv)); + + // Check that the correct number of rows were deduplicated + let output = wrk.output(&mut cmd); + + // 81 duplicates should be removed + assert!(String::from_utf8_lossy(&output.stderr).contains("81\n")); +} From eeefdee6bcb97ccdf44b4644f5636e65a3566747 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sat, 12 Oct 2024 19:24:26 -0400 Subject: [PATCH 020/119] `tests`: add extdedup csv mode test csvs --- .../test/boston311-extdedup-dupeoutput.csv | 21 +++++++++++++++++++ .../test/boston311-extdedup-neighborhood.csv | 20 ++++++++++++++++++ 2 files changed, 41 insertions(+) create mode 100644 resources/test/boston311-extdedup-dupeoutput.csv create mode 100644 resources/test/boston311-extdedup-neighborhood.csv diff --git a/resources/test/boston311-extdedup-dupeoutput.csv b/resources/test/boston311-extdedup-dupeoutput.csv new file mode 
100644 index 000000000..11b47dd80 --- /dev/null +++ b/resources/test/boston311-extdedup-dupeoutput.csv @@ -0,0 +1,21 @@ +dupe_rowno,case_enquiry_id,open_dt,target_dt,closed_dt,ontime,case_status,closure_reason,case_title,subject,reason,type,queue,department,submittedphoto,closedphoto,location,fire_district,pwd_district,city_council_district,police_district,neighborhood,neighborhood_services_district,ward,precinct,location_street_name,location_zipcode,latitude,longitude,source +40,101004154423,2022-01-31 08:05:00,,,ONTIME,Open, ,Sidewalk Cover / Manhole,Boston Water & Sewer Commission,Sidewalk Cover / Manhole,Sidewalk Cover / Manhole,INFO01_GenericeFormforOtherServiceRequestTypes,INFO,,,8 Putnam St Charlestown MA 02129,3,1A,1,A15,Charlestown,2,Ward 2,0201,8 Putnam St,02129,42.3735,-71.0599,Constituent Call +46,101004114154,2022-01-02 16:20:00,2022-01-10 08:30:00,,OVERDUE,Open, ,PWD Graffiti,Public Works Department,Highway Maintenance,PWD Graffiti,PWDx_Graffiti,PWDx,,,600 Atlantic Ave Boston MA 02210,3,1C,2,A1,Downtown / Financial District,3,Ward 3,0306,600 Atlantic Ave,02210,42.3527,-71.0536,Citizens Connect App +51,101004114795,2022-01-03 12:29:00,2022-03-07 12:29:41,,OVERDUE,Open, ,Graffiti: Ward 8 0803 ,Property Management,Graffiti,Graffiti Removal,PROP_GRAF_GraffitiRemoval,PROP,,,2000A Washington St Roxbury MA 02118,7,10B,7,D4,Roxbury,13,Ward 8,0803,2000A Washington St,02118,42.3333,-71.0797,Constituent Call +57,101004114016,2022-01-02 13:22:10,2022-01-04 08:30:00,2022-01-02 20:24:18,ONTIME,Closed,Case Closed. 
Closed date : Sun Jan 02 20:24:18 EST 2022 Resolved Has been cleaned up ,Requests for Street Cleaning,Public Works Department,Street Cleaning,Requests for Street Cleaning,PWDx_District 03: North Dorchester,PWDx,https://311.boston.gov/media/boston/report/photos/61d1ed4105bbcf180c2a2d66/report.jpg,,71 Willow Ct Dorchester MA 02125,6,03,2,C6,Dorchester,5,07,0708,71 Willow Ct,02125,42.3246,-71.0636,Citizens Connect App +58,101004113811,2022-01-02 08:01:29,2022-01-04 08:30:00,2022-01-03 05:59:50,ONTIME,Closed,Case Closed. Closed date : Mon Jan 03 05:59:50 EST 2022 Resolved ,CE Collection,Public Works Department,Street Cleaning,CE Collection,PWDx_District 10A: Roxbury,PWDx,,,INTERSECTION of Sunnyside St & Centre St Jamaica Plain MA ,9,10A,6,E13,Jamaica Plain,11,10,1009,INTERSECTION Sunnyside St & Centre St,,42.3594,-71.0587,City Worker App +59,101004113906,2022-01-02 10:32:35,2022-01-03 10:32:34,2022-01-03 06:44:23,ONTIME,Closed,Case Closed. Closed date : 2022-01-03 06:44:23.4 Duplicate of Existing Case ,Traffic Signal Inspection,Transportation - Traffic Division,Signs & Signals,Traffic Signal Inspection,BTDT_Traffic Signal_Repair,BTDT,https://311.boston.gov/media/boston/report/photos/61d1c58205bbcf180c2a1816/report.jpg,,INTERSECTION of Gallivan Blvd & Washington St Dorchester MA ,8,07,4,B3,Dorchester,7,17,1704,INTERSECTION Gallivan Blvd & Washington St,,42.3594,-71.0587,Citizens Connect App +62,101004114033,2022-01-02 13:38:41,2022-01-05 08:30:00,2022-01-03 07:08:35,ONTIME,Closed,Case Closed. Closed date : Mon Jan 03 07:08:35 EST 2022 Resolved No violation found at this time today is trash day. 
,Improper Storage of Trash (Barrels),Public Works Department,Code Enforcement,Improper Storage of Trash (Barrels),PWDx_Code Enforcement,PWDx,https://311.boston.gov/media/boston/report/photos/61d1f12405bbcf180c2a3082/report.jpg,,INTERSECTION of Lewis St & North St Boston MA ,3,1B,1,A1,Downtown / Financial District,3,3,,INTERSECTION Lewis St & North St,,42.3594,-71.0587,Citizens Connect App +65,101004113637,2022-01-01 17:24:56,2022-01-04 08:30:00,2022-01-03 00:03:27,ONTIME,Closed,Case Closed. Closed date : 2022-01-03 00:03:27.62 Case Resolved CLEAR ,Parking Enforcement,Transportation - Traffic Division,Enforcement & Abandoned Vehicles,Parking Enforcement,BTDT_Parking Enforcement,BTDT,,,353-361 Athens St South Boston MA 02127,6,05,2,C6,South Boston / South Boston Waterfront,5,Ward 6,0604,353-361 Athens St,02127,42.3369,-71.0471,Citizens Connect App +66,101004114724,2022-01-03 11:36:21,,2022-01-04 16:31:31,ONTIME,Closed,Case Closed. Closed date : 2022-01-04 16:31:31.297 Bulk Item Automation ,Schedule Bulk Item Pickup,Public Works Department,Sanitation,Schedule a Bulk Item Pickup SS,PWDx_Schedule a Bulk Item Pickup,PWDx,,,352 Riverway Boston MA 02115,4,10A,8,B2,Mission Hill,14,Ward 10,1004,352 Riverway,02115,42.3335,-71.1113,Self Service +71,101004113512,2022-01-01 12:43:50,2022-01-31 12:43:50,2022-01-03 10:46:27,ONTIME,Closed,Case Closed. Closed date : 2022-01-03 10:46:27.983 Case Noted BTD will investigate. Thank you for contacting 311 and BTD. ,New Sign Crosswalk or Pavement Marking,Transportation - Traffic Division,Signs & Signals,New Sign Crosswalk or Pavement Marking,BTDT_Engineering_New Sign and Pavement Marking Requests,BTDT,,,43 Oakview Ter Jamaica Plain MA 02130,9,02,6,E13,Jamaica Plain,11,Ward 19,1901,43 Oakview Ter,02130,42.3188,-71.1092,Self Service +79,101004114807,2022-01-03 12:35:00,,2022-01-10 16:30:33,ONTIME,Closed,Case Closed. 
Closed date : 2022-01-10 16:30:33.11 Bulk Item Automation ,Schedule a Bulk Item Pickup,Public Works Department,Sanitation,Schedule a Bulk Item Pickup,PWDx_Schedule a Bulk Item Pickup,PWDx,,,21 Ellington St Dorchester MA 02121,7,03,4,B3,Greater Mattapan,13,Ward 14,1403,21 Ellington St,02121,42.3021,-71.0844,Constituent Call +83,101004113526,2022-01-01 13:14:52,2022-01-04 08:30:00,2022-01-02 06:43:42,ONTIME,Closed,Case Closed. Closed date : Sun Jan 02 06:43:42 EST 2022 Resolved Trash removed ,CE Collection,Public Works Department,Street Cleaning,CE Collection,PWDx_District 10B: Roxbury,PWDx,,,16 Circuit St Roxbury MA 02119,7,10B,7,B2,Roxbury,13,Ward 12,1203,16 Circuit St,02119,42.3235,-71.0852,City Worker App +87,101004114108,2022-01-02 15:00:52,2022-01-04 08:30:00,2022-01-02 23:40:14,ONTIME,Closed,Case Closed. Closed date : 2022-01-02 23:40:14.32 Case Resolved CLEAR ,Parking Enforcement,Transportation - Traffic Division,Enforcement & Abandoned Vehicles,Parking Enforcement,BTDT_Parking Enforcement,BTDT,https://311.boston.gov/media/boston/report/photos/61d2046805bbcf180c2a418d/report.jpg,,INTERSECTION of Nassau St & Washington St Boston MA ,4,1C,2,A1,Downtown / Financial District,4,3,0308,INTERSECTION Nassau St & Washington St,,42.3594,-71.0587,Citizens Connect App +88,101004114783,2022-01-03 12:19:00,2022-01-04 12:19:43,2022-01-03 14:05:26,ONTIME,Closed,Case Closed. Closed date : 2022-01-03 14:05:26.86 Case Resolved Area ticketed ,Parking Enforcement,Transportation - Traffic Division,Enforcement & Abandoned Vehicles,Parking Enforcement,BTDT_Parking Enforcement,BTDT,,,32 Mount Vernon St Dorchester MA 02125,6,03,2,C6,Dorchester,5,Ward 7,0709,32 Mount Vernon St,02125,42.322,-71.0573,Constituent Call +89,101004113721,2022-01-01 21:31:54,2022-01-31 21:31:54,2022-01-04 08:34:40,ONTIME,Closed,Case Closed. Closed date : Tue Jan 04 08:34:40 EST 2022 Noted Investigating area will continue monitoring. 
,Rodent Activity,Inspectional Services,Environmental Services,Rodent Activity,ISD_Environmental Services (INTERNAL),ISD,,,INTERSECTION of Asticou Rd & Washington St Jamaica Plain MA ,12,02,6,E13,Jamaica Plain,11,19,1110,INTERSECTION Asticou Rd & Washington St,,42.3594,-71.0587,Citizens Connect App +91,101004113654,2022-01-01 18:07:52,2022-01-04 08:30:00,2022-01-01 19:07:41,ONTIME,Closed,Case Closed. Closed date : Sat Jan 01 19:07:41 EST 2022 Resolved Belly emptied ,Empty Litter Basket,Public Works Department,Highway Maintenance,Empty Litter Basket,PWDx_District 1B: North End,PWDx,https://311.boston.gov/media/boston/report/photos/61d0debd05bbcf180c29b2c6/report.jpg,,INTERSECTION of Prince St & Causeway St Boston MA ,3,1B,1,A1,Downtown / Financial District,3,3,0302,INTERSECTION Prince St & Causeway St,,42.3594,-71.0587,Citizens Connect App +94,101004113386,2022-01-01 09:23:39,2022-01-10 08:30:00,2022-01-01 12:56:14,ONTIME,Closed,Case Closed. Closed date : Sat Jan 01 12:56:14 EST 2022 Noted Don't believe this is a city park ,Litter / Ground Maintenance - Wellington Green (BPRD),Parks & Recreation Department,Park Maintenance & Safety,Ground Maintenance,PARK_Maintenance_Ground Maintenance,PARK,https://311.boston.gov/media/boston/report/photos/61d063e505bbcf180c297b6a/photo_20220101_092319.jpg,,563 Columbus Ave Roxbury MA 02118,4,1C,7,D4,South End,6,Ward 4,0404,563 Columbus Ave,02118,42.3412,-71.0815,Citizens Connect App +95,101004114021,2022-01-02 13:26:36,2022-01-04 08:30:00,2022-01-02 14:49:17,ONTIME,Closed,Case Closed. 
Closed date : Sun Jan 02 14:49:17 EST 2022 Resolved Dead rat picked up ,Pick up Dead Animal,Public Works Department,Street Cleaning,Pick up Dead Animal,PWDx_District 1B: North End,PWDx,https://311.boston.gov/media/boston/report/photos/61d1ee4b05bbcf180c2a2daf/report.jpg,,23 Charter St Boston MA 02113,3,1B,1,A1,Downtown / Financial District,3,Ward 3,0302,23 Charter St,02113,42.3668,-71.0535,Citizens Connect App +99,101004113902,2022-01-02 10:27:00,2022-01-10 08:30:00,,OVERDUE,Open, ,PWD Graffiti,Public Works Department,Highway Maintenance,PWD Graffiti,BTDT_BostonBikes,BTDT,https://311.boston.gov/media/boston/report/photos/61d1c45805bbcf180c2a17ee/report.jpg,,201 Massachusetts Ave Boston MA 02115,4,10A,7,D4,Back Bay,14,04,0405,201 Massachusetts Ave,02115,42.3452,-71.0871,Citizens Connect App +100,101004115118,2022-01-03 16:16:00,2022-01-19 16:16:48,2022-02-28 10:40:30,OVERDUE,Closed,Case Closed. Closed date : 2022-02-28 10:40:30.233 Case Noted Please resubmit with color make and plate number ,Abandoned Vehicles,Transportation - Traffic Division,Enforcement & Abandoned Vehicles,Abandoned Vehicles,BTDT_AVRS Interface Queue,BTDT,https://311.boston.gov/media/boston/report/photos/61d367b405bbcf180c2b1f49/report.jpg,,183 Orleans St East Boston MA 02128,1,09,1,A7,East Boston,1,01,0102,183 Orleans St,02128,42.3715,-71.034,Citizens Connect App diff --git a/resources/test/boston311-extdedup-neighborhood.csv b/resources/test/boston311-extdedup-neighborhood.csv new file mode 100644 index 000000000..1406e7051 --- /dev/null +++ b/resources/test/boston311-extdedup-neighborhood.csv @@ -0,0 +1,20 @@ +case_enquiry_id,open_dt,target_dt,closed_dt,ontime,case_status,closure_reason,case_title,subject,reason,type,queue,department,submittedphoto,closedphoto,location,fire_district,pwd_district,city_council_district,police_district,neighborhood,neighborhood_services_district,ward,precinct,location_street_name,location_zipcode,latitude,longitude,source +101004113637,2022-01-01 
17:24:56,2022-01-04 08:30:00,2022-01-03 00:03:27,ONTIME,Closed,Case Closed. Closed date : 2022-01-03 00:03:27.62 Case Resolved CLEAR ,Parking Enforcement,Transportation - Traffic Division,Enforcement & Abandoned Vehicles,Parking Enforcement,BTDT_Parking Enforcement,BTDT,,,353-361 Athens St South Boston MA 02127,6,05,2,C6,South Boston / South Boston Waterfront,5,Ward 6,0604,353-361 Athens St,02127,42.3369,-71.0471,Citizens Connect App +101004114795,2022-01-03 12:29:00,2022-03-07 12:29:41,,OVERDUE,Open, ,Graffiti: Ward 8 0803 ,Property Management,Graffiti,Graffiti Removal,PROP_GRAF_GraffitiRemoval,PROP,,,2000A Washington St Roxbury MA 02118,7,10B,7,D4,Roxbury,13,Ward 8,0803,2000A Washington St,02118,42.3333,-71.0797,Constituent Call +101004114783,2022-01-03 12:19:00,2022-01-04 12:19:43,2022-01-03 14:05:26,ONTIME,Closed,Case Closed. Closed date : 2022-01-03 14:05:26.86 Case Resolved Area ticketed ,Parking Enforcement,Transportation - Traffic Division,Enforcement & Abandoned Vehicles,Parking Enforcement,BTDT_Parking Enforcement,BTDT,,,32 Mount Vernon St Dorchester MA 02125,6,03,2,C6,Dorchester,5,Ward 7,0709,32 Mount Vernon St,02125,42.322,-71.0573,Constituent Call +101004141367,2022-01-20 08:15:45,2022-01-21 08:30:00,2022-01-20 08:45:12,ONTIME,Closed,Case Closed. Closed date : Thu Jan 20 08:45:12 EST 2022 Noted ,CE Collection,Public Works Department,Street Cleaning,CE Collection,PWDx_District 1B: North End,PWDx,,,12 Derne St Boston MA 02114,3,1B,1,A1,Beacon Hill,3,Ward 3,0306,12 Derne St,02114,42.3596,-71.0634,City Worker App +101004114154,2022-01-02 16:20:00,2022-01-10 08:30:00,,OVERDUE,Open, ,PWD Graffiti,Public Works Department,Highway Maintenance,PWD Graffiti,PWDx_Graffiti,PWDx,,,600 Atlantic Ave Boston MA 02210,3,1C,2,A1,Downtown / Financial District,3,Ward 3,0306,600 Atlantic Ave,02210,42.3527,-71.0536,Citizens Connect App +101004113822,2022-01-02 08:15:00,2022-01-10 08:30:00,2022-01-05 10:37:03,ONTIME,Closed,Case Closed. 
Closed date : Wed Jan 05 10:37:03 EST 2022 Resolved ,Electrical,Inspectional Services,Building,Electrical,ISD_Building (INTERNAL),ISD,,,156 Everett St East Boston MA 02128,1,09,1,A7,East Boston,1,Ward 1,0101,156 Everett St,02128,42.3666,-71.0323,Constituent Call +101004113313,2022-01-01 01:56:00,,,ONTIME,Open, ,Loud Parties/Music/People,Boston Police Department,Noise Disturbance,Loud Parties/Music/People,INFO01_GenericeFormforOtherServiceRequestTypes,INFO,,,755 Boylston St Boston MA 02116,4,1C,8,D4,Back Bay,14,Ward 5,0508,755 Boylston St,02116,42.3494,-71.0811,Constituent Call +101004114624,2022-01-03 10:12:00,2022-05-03 10:12:36,2022-01-13 14:12:46,ONTIME,Closed,Case Closed. Closed date : Thu Jan 13 14:12:46 EST 2022 Noted Violations found. Notice written. ,SCHEDULED Pest Infestation - Residential,Inspectional Services,Housing,Pest Infestation - Residential,ISD_Housing (INTERNAL),ISD,,,20 Washington St Brighton MA 02135,11,04,9,D14,Allston / Brighton,15,Ward 21,2112,20 Washington St,02135,42.3425,-71.1412,Constituent Call +101004114608,2022-01-03 10:02:57,2022-05-03 10:02:57,2022-01-03 10:11:24,ONTIME,Closed,Case Closed. Closed date : 2022-01-03 10:11:24.96 Case Invalid ,Pest Infestation - Residential,Inspectional Services,Housing,Pest Infestation - Residential,ISD_Housing (INTERNAL),ISD,,,20 Washington St Hyde Park MA 02136,12,08,5,E18,Hyde Park,10,18,1817,20 Washington St,02136,42.3594,-71.0587,Constituent Call +101004120108,2022-01-08 12:54:49,2022-01-11 08:30:00,2022-01-09 06:43:06,ONTIME,Closed,Case Closed. Closed date : Sun Jan 09 06:43:06 EST 2022 Noted ,CE Collection,Public Works Department,Street Cleaning,CE Collection,PWDx_District 1C: Downtown,PWDx,,,198 W Springfield St Roxbury MA 02118,4,1C,7,D4,South End,6,Ward 9,0902,198 W Springfield St,02118,42.3401,-71.0803,City Worker App +101004113512,2022-01-01 12:43:50,2022-01-31 12:43:50,2022-01-03 10:46:27,ONTIME,Closed,Case Closed. Closed date : 2022-01-03 10:46:27.983 Case Noted BTD will investigate. 
Thank you for contacting 311 and BTD. ,New Sign Crosswalk or Pavement Marking,Transportation - Traffic Division,Signs & Signals,New Sign Crosswalk or Pavement Marking,BTDT_Engineering_New Sign and Pavement Marking Requests,BTDT,,,43 Oakview Ter Jamaica Plain MA 02130,9,02,6,E13,Jamaica Plain,11,Ward 19,1901,43 Oakview Ter,02130,42.3188,-71.1092,Self Service +101004114807,2022-01-03 12:35:00,,2022-01-10 16:30:33,ONTIME,Closed,Case Closed. Closed date : 2022-01-10 16:30:33.11 Bulk Item Automation ,Schedule a Bulk Item Pickup,Public Works Department,Sanitation,Schedule a Bulk Item Pickup,PWDx_Schedule a Bulk Item Pickup,PWDx,,,21 Ellington St Dorchester MA 02121,7,03,4,B3,Greater Mattapan,13,Ward 14,1403,21 Ellington St,02121,42.3021,-71.0844,Constituent Call +101004113747,2022-01-01 23:46:09,2022-01-17 08:30:00,2022-01-02 11:03:10,ONTIME,Closed,Case Closed. Closed date : Sun Jan 02 11:03:10 EST 2022 Noted Case noted. Duplicate case. Posts already marked for contractor to repair. ,Street Light Outages,Public Works Department,Street Lights,Street Light Outages,PWDx_Street Light Outages,PWDx,https://311.boston.gov/media/boston/report/photos/61d12e0705bbcf180c29cfc2/report.jpg,,103 N Beacon St Brighton MA 02135,11,04,9,D14,Brighton,15,22,2205,103 N Beacon St,02135,42.3549,-71.143,Citizens Connect App +101004154423,2022-01-31 08:05:00,,,ONTIME,Open, ,Sidewalk Cover / Manhole,Boston Water & Sewer Commission,Sidewalk Cover / Manhole,Sidewalk Cover / Manhole,INFO01_GenericeFormforOtherServiceRequestTypes,INFO,,,8 Putnam St Charlestown MA 02129,3,1A,1,A15,Charlestown,2,Ward 2,0201,8 Putnam St,02129,42.3735,-71.0599,Constituent Call +101004115093,2022-01-03 16:06:33,,2022-01-04 08:15:58,ONTIME,Closed,Case Closed. Closed date : 2022-01-04 08:15:58.1 Case Invalid This case has been closed as there is not enough information to process this request. If you feel this has been closed in error please dial 311 to submit a new request. 
Sincerely Boston 311 Team ,City/State Snow Issues,Mayor's 24 Hour Hotline,Programs,City/State Snow Issues,INFO01_GenericeFormforOtherServiceRequestTypes,INFO,,,40 Battery St Boston MA 02109,3,1B,1,A1,Boston,3,03,0301,40 Battery St,02109,42.3594,-71.0587,Constituent Call +101004114724,2022-01-03 11:36:21,,2022-01-04 16:31:31,ONTIME,Closed,Case Closed. Closed date : 2022-01-04 16:31:31.297 Bulk Item Automation ,Schedule Bulk Item Pickup,Public Works Department,Sanitation,Schedule a Bulk Item Pickup SS,PWDx_Schedule a Bulk Item Pickup,PWDx,,,352 Riverway Boston MA 02115,4,10A,8,B2,Mission Hill,14,Ward 10,1004,352 Riverway,02115,42.3335,-71.1113,Self Service +101004114391,2022-01-03 08:00:00,2022-01-04 08:30:00,2022-01-03 08:36:14,ONTIME,Closed,Case Closed. Closed date : Mon Jan 03 08:36:14 EST 2022 Resolved Been removed ,Requests for Street Cleaning,Public Works Department,Street Cleaning,Requests for Street Cleaning,PWDx_District 06: West Roxbury and Roslindale,PWDx,,,2432 Centre St West Roxbury MA 02132,12,06,6,E5,West Roxbury,12,Ward 20,2015,2432 Centre St,02132,42.2674,-71.1626,Constituent Call +101004114656,2022-01-03 10:43:00,2022-01-24 10:43:43,2022-01-18 08:00:19,ONTIME,Closed,Case Closed Case Resolved NEW CART#21026466 DELV ON 1/11/22 ,Request for Recycling Cart,Public Works Department,Recycling,Request for Recycling Cart,PWDx_Recycling Sent to Contractor,PWDx,,,49 Westbourne St Roslindale MA 02131,12,06,5,E5,Roslindale,10,Ward 20,2009,49 Westbourne St,02131,42.2821,-71.1415,Constituent Call +101004143000,2022-01-21 13:47:00,2022-02-04 13:47:30,,OVERDUE,Open, ,BTDT: Complaint,Mayor's 24 Hour Hotline,Employee & General Comments,General Comments For a Program or Policy,BTDT_Parking Enforcement,BTDT,,, , , , , , , , , ,,,42.3594,-71.0587,Constituent Call From fc1e756f0a685f8b3ccd79b48a5dcbe60da080d6 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sat, 12 Oct 2024 19:43:13 -0400 Subject: [PATCH 021/119] 
`extdedup`: use itoa for faster integer to string conversion --- src/cmd/extdedup.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cmd/extdedup.rs b/src/cmd/extdedup.rs index f96f872ba..f0b1abe5f 100644 --- a/src/cmd/extdedup.rs +++ b/src/cmd/extdedup.rs @@ -167,7 +167,7 @@ fn dedup_csv(args: Args, mem_limited_buffer: u64) -> Result Date: Sat, 12 Oct 2024 20:22:00 -0400 Subject: [PATCH 022/119] `docs`: `extdedup` has selector support --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 4ce8b7577..573d23ac5 100644 --- a/README.md +++ b/README.md @@ -42,7 +42,7 @@ | [excel](/src/cmd/excel.rs#L2)
🚀 | Exports a specified Excel/ODS sheet to a CSV file. | | [exclude](/src/cmd/exclude.rs#L2)
📇👆 | Removes a set of CSV data from another set based on the specified columns. | | [explode](/src/cmd/explode.rs#L2)
🔣👆 | Explode rows into multiple ones by splitting a column value based on the given separator. | -| [extdedup](/src/cmd/extdedup.rs#L2)
| Remove duplicate rows from an arbitrarily large CSV/text file using a memory-mapped, [on-disk hash table](https://crates.io/crates/odht). Unlike the `dedup` command, this command does not load the entire file into memory nor does it sort the deduped file. | +| [extdedup](/src/cmd/extdedup.rs#L2)
👆 | Remove duplicate rows from an arbitrarily large CSV/text file using a memory-mapped, [on-disk hash table](https://crates.io/crates/odht). Unlike the `dedup` command, this command does not load the entire file into memory nor does it sort the deduped file. | | [extsort](/src/cmd/extsort.rs#L2)
🚀 | Sort an arbitrarily large CSV/text file using a multithreaded [external merge sort](https://en.wikipedia.org/wiki/External_sorting) algorithm. | | [fetch](/src/cmd/fetch.rs#L3)
✨🧠🌐 | Fetches data from web services for every row using **HTTP Get**. Comes with [HTTP/2](https://http2-explained.haxx.se/en/part1) [adaptive flow control](https://medium.com/coderscorner/http-2-flow-control-77e54f7fd518), [jql](https://github.com/yamafaktory/jql#%EF%B8%8F-usage) JSON query language support, dynamic throttling ([RateLimit](https://www.ietf.org/archive/id/draft-ietf-httpapi-ratelimit-headers-06.html)) & caching with available persistent caching using [Redis](https://redis.io/) or a disk-cache. | | [fetchpost](/src/cmd/fetchpost.rs#L3)
✨🧠🌐 | Similar to `fetch`, but uses **HTTP Post**. ([HTTP GET vs POST methods](https://www.geeksforgeeks.org/difference-between-http-get-and-post-methods/)) | From be0a35d42cc88089e48fb32346bf964d33a0f3f1 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sun, 13 Oct 2024 00:26:54 +0000 Subject: [PATCH 023/119] build(deps): bump flexi_logger from 0.29.2 to 0.29.3 Bumps [flexi_logger](https://github.com/emabee/flexi_logger) from 0.29.2 to 0.29.3. - [Changelog](https://github.com/emabee/flexi_logger/blob/main/CHANGELOG.md) - [Commits](https://github.com/emabee/flexi_logger/commits) --- updated-dependencies: - dependency-name: flexi_logger dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- Cargo.lock | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 876aa0a98..0da0b2ec1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2424,15 +2424,14 @@ dependencies = [ [[package]] name = "flexi_logger" -version = "0.29.2" +version = "0.29.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c6e500462d7c5ee8b974423b55bd47f3e09c8306050e5bbeaccaf2b17992f70" +checksum = "719236bdbcf6033a3395165f797076b31056018e6723ccff616eb25fc9c99de1" dependencies = [ "chrono", "crossbeam-channel", "crossbeam-queue", "flate2", - "glob", "log", "thiserror", ] From 57180756a5d260e22d5f796bff28000f5e51110d Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sat, 12 Oct 2024 20:39:55 -0400 Subject: [PATCH 024/119] `deps`: use latest polars upstream; update lock file --- Cargo.lock | 47 ++++++++++++++++++++++++----------------------- Cargo.toml | 4 ++-- 2 files changed, 26 insertions(+), 25 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0da0b2ec1..cced2442b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1168,9 +1168,9 @@ dependencies = [ [[package]] name = "cc" 
-version = "1.1.29" +version = "1.1.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "58e804ac3194a48bb129643eb1d62fcc20d18c6b8c181704489353d13120bcd1" +checksum = "b16803a61b81d9eabb7eae2588776c4c1e584b738ede45fdbb4c972cec1e9945" dependencies = [ "jobserver", "libc", @@ -2447,9 +2447,9 @@ dependencies = [ [[package]] name = "fluent-uri" -version = "0.3.1" +version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b7bd399b64ddd63a83cf40512c96007dafe9ac26cfc8c89c820a247c6f7d2376" +checksum = "1918b65d96df47d3591bed19c5cca17e3fa5d0707318e4b5ef2eae01764df7e5" dependencies = [ "borrow-or-share", "ref-cast", @@ -4749,7 +4749,7 @@ dependencies = [ [[package]] name = "polars" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=ee9bafb#ee9bafbdef7d62baa06d469f42e6cec0755eb544" +source = "git+https://github.com/pola-rs/polars?rev=48f6e9d#48f6e9dc7c62f60ee9cb2ef9313917b63c96b1f7" dependencies = [ "getrandom", "polars-arrow", @@ -4768,7 +4768,7 @@ dependencies = [ [[package]] name = "polars-arrow" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=ee9bafb#ee9bafbdef7d62baa06d469f42e6cec0755eb544" +source = "git+https://github.com/pola-rs/polars?rev=48f6e9d#48f6e9dc7c62f60ee9cb2ef9313917b63c96b1f7" dependencies = [ "ahash", "atoi", @@ -4816,7 +4816,7 @@ dependencies = [ [[package]] name = "polars-compute" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=ee9bafb#ee9bafbdef7d62baa06d469f42e6cec0755eb544" +source = "git+https://github.com/pola-rs/polars?rev=48f6e9d#48f6e9dc7c62f60ee9cb2ef9313917b63c96b1f7" dependencies = [ "bytemuck", "either", @@ -4831,7 +4831,7 @@ dependencies = [ [[package]] name = "polars-core" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=ee9bafb#ee9bafbdef7d62baa06d469f42e6cec0755eb544" +source = "git+https://github.com/pola-rs/polars?rev=48f6e9d#48f6e9dc7c62f60ee9cb2ef9313917b63c96b1f7" dependencies 
= [ "ahash", "bitflags 2.6.0", @@ -4865,7 +4865,7 @@ dependencies = [ [[package]] name = "polars-error" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=ee9bafb#ee9bafbdef7d62baa06d469f42e6cec0755eb544" +source = "git+https://github.com/pola-rs/polars?rev=48f6e9d#48f6e9dc7c62f60ee9cb2ef9313917b63c96b1f7" dependencies = [ "avro-schema", "object_store", @@ -4878,10 +4878,11 @@ dependencies = [ [[package]] name = "polars-expr" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=ee9bafb#ee9bafbdef7d62baa06d469f42e6cec0755eb544" +source = "git+https://github.com/pola-rs/polars?rev=48f6e9d#48f6e9dc7c62f60ee9cb2ef9313917b63c96b1f7" dependencies = [ "ahash", "bitflags 2.6.0", + "num-traits", "once_cell", "polars-arrow", "polars-compute", @@ -4897,7 +4898,7 @@ dependencies = [ [[package]] name = "polars-io" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=ee9bafb#ee9bafbdef7d62baa06d469f42e6cec0755eb544" +source = "git+https://github.com/pola-rs/polars?rev=48f6e9d#48f6e9dc7c62f60ee9cb2ef9313917b63c96b1f7" dependencies = [ "ahash", "async-trait", @@ -4943,7 +4944,7 @@ dependencies = [ [[package]] name = "polars-json" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=ee9bafb#ee9bafbdef7d62baa06d469f42e6cec0755eb544" +source = "git+https://github.com/pola-rs/polars?rev=48f6e9d#48f6e9dc7c62f60ee9cb2ef9313917b63c96b1f7" dependencies = [ "ahash", "chrono", @@ -4964,7 +4965,7 @@ dependencies = [ [[package]] name = "polars-lazy" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=ee9bafb#ee9bafbdef7d62baa06d469f42e6cec0755eb544" +source = "git+https://github.com/pola-rs/polars?rev=48f6e9d#48f6e9dc7c62f60ee9cb2ef9313917b63c96b1f7" dependencies = [ "ahash", "bitflags 2.6.0", @@ -4990,7 +4991,7 @@ dependencies = [ [[package]] name = "polars-mem-engine" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=ee9bafb#ee9bafbdef7d62baa06d469f42e6cec0755eb544" 
+source = "git+https://github.com/pola-rs/polars?rev=48f6e9d#48f6e9dc7c62f60ee9cb2ef9313917b63c96b1f7" dependencies = [ "futures", "memmap2", @@ -5011,7 +5012,7 @@ dependencies = [ [[package]] name = "polars-ops" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=ee9bafb#ee9bafbdef7d62baa06d469f42e6cec0755eb544" +source = "git+https://github.com/pola-rs/polars?rev=48f6e9d#48f6e9dc7c62f60ee9cb2ef9313917b63c96b1f7" dependencies = [ "ahash", "argminmax", @@ -5044,7 +5045,7 @@ dependencies = [ [[package]] name = "polars-parquet" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=ee9bafb#ee9bafbdef7d62baa06d469f42e6cec0755eb544" +source = "git+https://github.com/pola-rs/polars?rev=48f6e9d#48f6e9dc7c62f60ee9cb2ef9313917b63c96b1f7" dependencies = [ "ahash", "async-stream", @@ -5072,7 +5073,7 @@ dependencies = [ [[package]] name = "polars-pipe" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=ee9bafb#ee9bafbdef7d62baa06d469f42e6cec0755eb544" +source = "git+https://github.com/pola-rs/polars?rev=48f6e9d#48f6e9dc7c62f60ee9cb2ef9313917b63c96b1f7" dependencies = [ "crossbeam-channel", "crossbeam-queue", @@ -5098,7 +5099,7 @@ dependencies = [ [[package]] name = "polars-plan" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=ee9bafb#ee9bafbdef7d62baa06d469f42e6cec0755eb544" +source = "git+https://github.com/pola-rs/polars?rev=48f6e9d#48f6e9dc7c62f60ee9cb2ef9313917b63c96b1f7" dependencies = [ "ahash", "bitflags 2.6.0", @@ -5131,7 +5132,7 @@ dependencies = [ [[package]] name = "polars-row" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=ee9bafb#ee9bafbdef7d62baa06d469f42e6cec0755eb544" +source = "git+https://github.com/pola-rs/polars?rev=48f6e9d#48f6e9dc7c62f60ee9cb2ef9313917b63c96b1f7" dependencies = [ "bytemuck", "polars-arrow", @@ -5142,7 +5143,7 @@ dependencies = [ [[package]] name = "polars-schema" version = "0.43.1" -source = 
"git+https://github.com/pola-rs/polars?rev=ee9bafb#ee9bafbdef7d62baa06d469f42e6cec0755eb544" +source = "git+https://github.com/pola-rs/polars?rev=48f6e9d#48f6e9dc7c62f60ee9cb2ef9313917b63c96b1f7" dependencies = [ "indexmap", "polars-error", @@ -5154,7 +5155,7 @@ dependencies = [ [[package]] name = "polars-sql" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=ee9bafb#ee9bafbdef7d62baa06d469f42e6cec0755eb544" +source = "git+https://github.com/pola-rs/polars?rev=48f6e9d#48f6e9dc7c62f60ee9cb2ef9313917b63c96b1f7" dependencies = [ "hex", "once_cell", @@ -5175,7 +5176,7 @@ dependencies = [ [[package]] name = "polars-time" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=ee9bafb#ee9bafbdef7d62baa06d469f42e6cec0755eb544" +source = "git+https://github.com/pola-rs/polars?rev=48f6e9d#48f6e9dc7c62f60ee9cb2ef9313917b63c96b1f7" dependencies = [ "atoi", "bytemuck", @@ -5195,7 +5196,7 @@ dependencies = [ [[package]] name = "polars-utils" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=ee9bafb#ee9bafbdef7d62baa06d469f42e6cec0755eb544" +source = "git+https://github.com/pola-rs/polars?rev=48f6e9d#48f6e9dc7c62f60ee9cb2ef9313917b63c96b1f7" dependencies = [ "ahash", "bytemuck", diff --git a/Cargo.toml b/Cargo.toml index a2c452984..93ea67ee3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -314,9 +314,9 @@ governor = { git = "https://github.com/jqnatividad/governor", branch = "deps-bum # BUILD NOTE: Be sure to set QSV_POLARS_REV below to the latest commit short hash or tag # of polars/py-polars before building qsv. This allows us to show the polars rev/tag in --version. 
# if we are using a release version of Rust Polars, leave QSV_POLARS_REV empty -# QSV_POLARS_REV=ee9bafb +# QSV_POLARS_REV=48f6e9d # polars = { git = "https://github.com/pola-rs/polars", tag = "py-1.9.0" } -polars = { git = "https://github.com/pola-rs/polars", rev = "ee9bafb" } +polars = { git = "https://github.com/pola-rs/polars", rev = "48f6e9d" } [features] default = ["mimalloc"] From e3d5066b908c9576865b5874c5b3ba77d142d0aa Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sat, 12 Oct 2024 21:33:01 -0400 Subject: [PATCH 025/119] `validate`: bump jsonschema from 0.22 to 0.23 and adapt to new API --- Cargo.lock | 10 +++++---- Cargo.toml | 5 +---- src/cmd/validate.rs | 54 ++++++++++++++++++--------------------------- 3 files changed, 28 insertions(+), 41 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index cced2442b..a5cf98eb1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3513,8 +3513,9 @@ dependencies = [ [[package]] name = "jsonschema" -version = "0.22.3" -source = "git+https://github.com/Stranger6667/jsonschema-rs?rev=eb630ef#eb630ef1f2d8508b672b039a27e2708ac943a659" +version = "0.23.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bea85509e7320309cc8be62d8badb46f9525157bdc748bf2ec089cd4083a3f7e" dependencies = [ "ahash", "anyhow", @@ -5970,8 +5971,9 @@ dependencies = [ [[package]] name = "referencing" -version = "0.22.3" -source = "git+https://github.com/Stranger6667/jsonschema-rs?rev=eb630ef#eb630ef1f2d8508b672b039a27e2708ac943a659" +version = "0.23.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "35bdf02a06a0820fcb9ce064715b424f1cdd79e24f991990a92425afe11eaf4a" dependencies = [ "ahash", "fluent-uri", diff --git a/Cargo.toml b/Cargo.toml index 93ea67ee3..e0e0ab898 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -131,7 +131,7 @@ jaq-interpret = "1.5.0" jaq-parse = "1.0.3" jemallocator = { version = "0.5", optional = true } json-objects-to-csv = 
"0.1.3" -jsonschema = { version = "0.22", features = [ +jsonschema = { version = "0.23", features = [ "resolve-file", "resolve-http", ], default-features = false } @@ -293,9 +293,6 @@ grex = { git = "https://github.com/pemistahl/grex", rev = "ff8533d" } # use modernized version of local_encoding local-encoding = { git = "https://github.com/slonopotamus/local-encoding-rs", branch = "travis-madness" } -# use jsonschema with unreleased perf improvements & fixes -jsonschema = { git = "https://github.com/Stranger6667/jsonschema-rs", rev = "eb630ef" } - # use of fork of xlsxwriter with bumped bindgen dependency xlsxwriter = { git = "https://github.com/jqnatividad/xlsxwriter-rs", branch = "bump-bindgen-to-0.70.1" } diff --git a/src/cmd/validate.rs b/src/cmd/validate.rs index e87f4911b..d459345c0 100644 --- a/src/cmd/validate.rs +++ b/src/cmd/validate.rs @@ -149,7 +149,7 @@ use indicatif::HumanCount; use indicatif::{ProgressBar, ProgressDrawTarget}; use jsonschema::{ output::BasicOutput, - paths::{JsonPointer, JsonPointerNode, PathChunk}, + paths::{LazyLocation, Location}, ErrorIterator, Keyword, ValidationError, Validator, }; use log::{debug, info, log_enabled}; @@ -179,8 +179,8 @@ macro_rules! 
fail_validation_error { let err = format!($($t)*); error!("{err}"); Err(ValidationError::custom( - JsonPointer::default(), - JsonPointer::default(), + Location::default(), + Location::default(), &Value::Null, err, )) @@ -260,13 +260,13 @@ impl Keyword for DynEnumValidator { fn validate<'instance>( &self, instance: &'instance Value, - instance_path: &JsonPointerNode, + instance_path: &LazyLocation, ) -> ErrorIterator<'instance> { if self.dynenum_set.contains(instance.as_str().unwrap()) { Box::new(std::iter::empty()) } else { let error = ValidationError::custom( - JsonPointer::default(), + Location::default(), instance_path.into(), instance, format!("{instance} is not a valid dynamicEnum value"), @@ -289,7 +289,7 @@ impl Keyword for DynEnumValidator { fn dyn_enum_validator_factory<'a>( _parent: &'a Map, value: &'a Value, - jsonpointer: JsonPointer, + location: Location, ) -> Result, ValidationError<'a>> { if let Value::String(uri) = value { let temp_download = NamedTempFile::new()?; @@ -340,8 +340,8 @@ fn dyn_enum_validator_factory<'a>( Ok(Box::new(DynEnumValidator::new(enum_set))) } else { Err(ValidationError::custom( - JsonPointer::default(), - jsonpointer, + Location::default(), + location, value, "'dynamicEnum' must be set to a CSV file on the local filesystem or on a URL.", )) @@ -1164,34 +1164,22 @@ fn validate_json_instance( instance: &Value, schema_compiled: &Validator, ) -> Option> { - let validation_output = schema_compiled.apply(instance); - - // If validation output is Invalid, then grab field names and errors - if validation_output.flag() { - None - } else { - // get validation errors as String - let validation_errors: Vec<(String, String)> = match validation_output.basic() { - BasicOutput::Invalid(errors) => errors + match schema_compiled.apply(instance).basic() { + BasicOutput::Valid(_) => None, + BasicOutput::Invalid(errors) => Some( + errors .iter() .map(|e| { - if let Some(PathChunk::Property(box_str)) = e.instance_location().last() { - 
(box_str.to_string(), e.error_description().to_string()) - } else { - ( - e.instance_location().to_string(), - e.error_description().to_string(), - ) - } + ( + e.instance_location() + .to_string() + .trim_start_matches('/') + .to_owned(), + e.error_description().to_string(), + ) }) .collect(), - BasicOutput::Valid(_annotations) => { - // shouldn't happen - unreachable!("Unexpected error."); - }, - }; - - Some(validation_errors) + ), } } @@ -1551,7 +1539,7 @@ fn test_dyn_enum_validator() { let err_info = e.into_iter().next().unwrap(); assert_eq!( format!("{err_info:?}"), - r#"ValidationError { instance: String("lanzones"), kind: Custom { message: "\"lanzones\" is not a valid dynamicEnum value" }, instance_path: JsonPointer([]), schema_path: JsonPointer([]) }"# + r#"ValidationError { instance: String("lanzones"), kind: Custom { message: "\"lanzones\" is not a valid dynamicEnum value" }, instance_path: Location(""), schema_path: Location("") }"# ); } else { unreachable!("Expected an error, but validation succeeded."); From fe780090b9be80e2029c638f84ffb3eeb7910ce3 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sat, 12 Oct 2024 23:19:43 -0400 Subject: [PATCH 026/119] `validate`: refactor to_json_instance - non-utf8 values no longer cause an error, the lossy string is used instead - push into Map at the end --- src/cmd/validate.rs | 41 +++++++++++++++++------------------------ 1 file changed, 17 insertions(+), 24 deletions(-) diff --git a/src/cmd/validate.rs b/src/cmd/validate.rs index d459345c0..be00f8896 100644 --- a/src/cmd/validate.rs +++ b/src/cmd/validate.rs @@ -130,7 +130,6 @@ Common options: "#; use std::{ - borrow::Cow, env, fs::File, io::{BufReader, BufWriter, Read, Write}, @@ -920,59 +919,53 @@ fn to_json_instance( ) -> CliResult { let mut json_object_map: Map = Map::with_capacity(header_len); - let mut key_string: String; + let mut lossy_string; for ((key, json_type), value) in 
header_types.iter().zip(record.iter()) { - key_string = key.to_owned(); - if value.is_empty() { - json_object_map.insert(key_string, Value::Null); + json_object_map.insert(key.clone(), Value::Null); continue; } let value_str = if let Ok(v) = simdutf8::basic::from_utf8(value) { - Cow::Borrowed(v) + v } else { - let s = String::from_utf8_lossy(value); - return fail_encoding_clierror!("CSV value \"{s}\" is not valid UTF-8"); + lossy_string = String::from_utf8_lossy(value).to_string(); + &lossy_string }; - match *json_type { - JSONtypes::String => { - json_object_map.insert(key_string, Value::String(value_str.into_owned())); - }, + let json_value = match json_type { + JSONtypes::String => Value::String(value_str.to_owned()), JSONtypes::Number => { if let Ok(float) = value_str.parse::() { - json_object_map - .insert(key_string, Value::Number(Number::from_f64(float).unwrap())); + Value::Number(Number::from_f64(float).unwrap()) } else { return fail_clierror!( - "Can't cast into Number. key: {key_string}, value: {value_str}" + "Can't cast into Number. key: {key}, value: {value_str}" ); } }, JSONtypes::Integer => { if let Ok(int) = atoi_simd::parse::(value_str.as_bytes()) { - json_object_map.insert(key_string, Value::Number(Number::from(int))); + Value::Number(Number::from(int)) } else { return fail_clierror!( - "Can't cast into Integer. key: {key_string}, value: {value_str}" + "Can't cast into Integer. key: {key}, value: {value_str}" ); } }, JSONtypes::Boolean => { if let Ok(boolean) = value_str.parse::() { - json_object_map.insert(key_string, Value::Bool(boolean)); + Value::Bool(boolean) } else { return fail_clierror!( - "Can't cast into Boolean. key: {key_string}, value: {value_str}" + "Can't cast into Boolean. 
key: {key}, value: {value_str}" ); } }, - JSONtypes::Unsupported => { - // unreachable because we assigned JSONtypes - unreachable!("we should never get an unsupported JSON type"); - }, - } + JSONtypes::Unsupported => unreachable!("we should never get an unsupported JSON type"), + }; + + json_object_map.insert(key.clone(), json_value); } Ok(Value::Object(json_object_map)) From a8d5718d257b881994ba795c77b921b086a8cf76 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sat, 12 Oct 2024 23:28:02 -0400 Subject: [PATCH 027/119] `docs`: update `validate` throughput [skip ci] --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 573d23ac5..fa383a58a 100644 --- a/README.md +++ b/README.md @@ -87,7 +87,7 @@ | [to](/src/cmd/to.rs#L2)
✨🚀🗄️ | Convert CSV files to [PostgreSQL](https://www.postgresql.org), [SQLite](https://www.sqlite.org/index.html), XLSX and [Data Package](https://datahub.io/docs/data-packages/tabular). | | [tojsonl](/src/cmd/tojsonl.rs#L3)
📇😣🚀🔣🪄 | Smartly converts CSV to a newline-delimited JSON ([JSONL](https://jsonlines.org/)/[NDJSON](http://ndjson.org/)). By scanning the CSV first, it "smartly" infers the appropriate JSON data type for each column. See `jsonl` command to convert JSONL to CSV. | | [transpose](/src/cmd/transpose.rs#L2)
🤯 | Transpose rows/columns of a CSV. | -| [validate](/src/cmd/validate.rs#L2)
📇🚀🌐 | Validate CSV data [_blazingly-fast_](https://github.com/Stranger6667/jsonschema-rs?tab=readme-ov-file#performance "using jsonschema-rs - the fastest JSON Schema validator for Rust") using [JSON Schema Validation (Draft 2020-12)](https://json-schema.org/draft/2020-12/json-schema-validation.html) (e.g. _up to 761,035 rows/second_[^1] using [NYC's 311 schema](https://github.com/jqnatividad/qsv/blob/master/resources/test/311_Service_Requests_from_2010_to_Present-2022-03-04.csv.schema.json) generated by the [`schema`](#schema_deeplink) command) & put invalid records into a separate file with an accompanying detailed validation error report file.
Supports a custom `currency` format with [ISO-4217](https://en.wikipedia.org/wiki/ISO_4217) validation, and a custom `dynamicEnum` keyword that supports enum validation against a CSV on the filesystem or on a URL.
If no JSON schema file is provided, validates if a CSV conforms to the [RFC 4180 standard](#rfc-4180-csv-standard) and is UTF-8 encoded. | +| [validate](/src/cmd/validate.rs#L2)
📇🚀🌐 | Validate CSV data [_blazingly-fast_](https://github.com/Stranger6667/jsonschema-rs?tab=readme-ov-file#performance "using jsonschema-rs - the fastest JSON Schema validator for Rust") using [JSON Schema Validation (Draft 2020-12)](https://json-schema.org/draft/2020-12/json-schema-validation.html) (e.g. _up to 780,031 rows/second_[^1] using [NYC's 311 schema](https://github.com/jqnatividad/qsv/blob/master/resources/test/311_Service_Requests_from_2010_to_Present-2022-03-04.csv.schema.json) generated by the [`schema`](#schema_deeplink) command) & put invalid records into a separate file with an accompanying detailed validation error report file.
Supports a custom `currency` format with [ISO-4217](https://en.wikipedia.org/wiki/ISO_4217) validation, and a custom `dynamicEnum` keyword that supports enum validation against a CSV on the filesystem or on a URL.
If no JSON schema file is provided, validates if a CSV conforms to the [RFC 4180 standard](#rfc-4180-csv-standard) and is UTF-8 encoded. |
Performance metrics compiled on an M2 Pro 12-core Mac Mini with 32gb RAM
From 18b08d950642dc0dc74d0c8cadedd34f7363ef6d Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sun, 13 Oct 2024 07:30:11 -0400 Subject: [PATCH 028/119] `deps`: set arboard default-features = false we don't need image support --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index e0e0ab898..4fdd14fd3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -69,7 +69,7 @@ panic = "abort" [dependencies] ahash = "0.8" -arboard = "3.4.1" +arboard = { version = "3.4.1", default-features = false } arrow = { version = "53", default-features = false, features = [ "csv", ], optional = true } From ed80f21533ebd219e9ec746915194cfffb7721ea Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sun, 13 Oct 2024 07:30:57 -0400 Subject: [PATCH 029/119] `test`: enable clipboard test --- tests/test_clipboard.rs | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/tests/test_clipboard.rs b/tests/test_clipboard.rs index 7f75f6a06..faa12ca76 100644 --- a/tests/test_clipboard.rs +++ b/tests/test_clipboard.rs @@ -1,14 +1,12 @@ use crate::workdir::Workdir; +use arboard::Clipboard; -// Assume a user has qsv stats output in their clipboard. -// This test compares the stats output of fruits.csv to the clipboard output. #[test] -#[ignore = "Requires clipboard to test."] +// #[ignore = "Requires clipboard to test."] fn clipboard_success() { let wrk = Workdir::new("stats_clipboard_equality"); - let mut clipboard_cmd = wrk.command("clipboard"); - let clipboard_output: String = wrk.stdout(&mut clipboard_cmd); + // This test compares the stats output of fruits.csv to the clipboard output. 
#[cfg(not(windows))] let expected = "field,type,is_ascii,sum,min,max,range,min_length,max_length,mean,sem,stddev,\ variance,cv,nullcount,max_precision,sparsity\nfruit,String,true,,apple,\ @@ -20,5 +18,11 @@ fn clipboard_success() { strawberry,,5,10,,,,,,0,,0\r\nprice,Float,,7,1.5,3.0,1.5,4,4,2.3333,0.36,0.\ 6236,0.3889,26.7261,0,1,0"; + let mut clipboard = Clipboard::new().unwrap(); + clipboard.set_text(expected).unwrap(); + + let mut clipboard_cmd = wrk.command("clipboard"); + let clipboard_output: String = wrk.stdout(&mut clipboard_cmd); + assert_eq!(clipboard_output, expected); } From 58cc47438b12b070611fcd643da8d29813061b3d Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sun, 13 Oct 2024 07:32:16 -0400 Subject: [PATCH 030/119] rustfmt --- tests/test_clipboard.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_clipboard.rs b/tests/test_clipboard.rs index faa12ca76..83669108d 100644 --- a/tests/test_clipboard.rs +++ b/tests/test_clipboard.rs @@ -1,6 +1,7 @@ -use crate::workdir::Workdir; use arboard::Clipboard; +use crate::workdir::Workdir; + #[test] // #[ignore = "Requires clipboard to test."] fn clipboard_success() { From b3f119a54eef31b89ca1f6656180b705235191b8 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sun, 13 Oct 2024 07:44:45 -0400 Subject: [PATCH 031/119] `tests`: only enable clipboard test on platforms with a built-in clipboard --- tests/tests.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/tests.rs b/tests/tests.rs index ea5d0a102..a446e9f88 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -39,7 +39,10 @@ mod test_applydp; mod test_behead; #[cfg(any(feature = "feature_capable", feature = "lite"))] mod test_cat; -#[cfg(any(feature = "feature_capable", feature = "lite"))] +#[cfg(all( + any(feature = "feature_capable", feature = "lite"), + any(target_os = "windows", target_os = "macos") 
+))] mod test_clipboard; mod test_combos; mod test_comments; From 32d196bc5193b995281d7f35b03283cb54b175a8 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sun, 13 Oct 2024 08:20:34 -0400 Subject: [PATCH 032/119] `validate`: do location trimming of leading forward slash in `do_json_validation` when creating the error records. In CSVs, JSON path does not make sense as its flat. Trim away the leading forward slash. Doing it this way also saves an allocation per error record --- src/cmd/validate.rs | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/src/cmd/validate.rs b/src/cmd/validate.rs index be00f8896..ef152f1ff 100644 --- a/src/cmd/validate.rs +++ b/src/cmd/validate.rs @@ -903,7 +903,10 @@ fn do_json_validation( .iter() .map(|(field, error)| { // validation error file format: row_number, field, error - format!("{row_number_string}\t{field}\t{error}") + format!( + "{row_number_string}\t{field}\t{error}", + field = field.trim_start_matches('/') + ) }) .collect::>() .join("\n") @@ -1164,10 +1167,7 @@ fn validate_json_instance( .iter() .map(|e| { ( - e.instance_location() - .to_string() - .trim_start_matches('/') - .to_owned(), + e.instance_location().to_string(), e.error_description().to_string(), ) }) @@ -1303,7 +1303,7 @@ mod tests_for_schema_validation { assert_eq!( vec![( - "name".to_string(), + "/name".to_string(), "\"X\" is shorter than 2 characters".to_string() )], result.unwrap() @@ -1374,7 +1374,7 @@ fn test_validate_currency_email_dynamicenum_validator() { assert_eq!( result, Some(vec![( - "fee".to_owned(), + "/fee".to_owned(), "\"Ð 100.00\" is not a \"currency\"".to_owned() )]) ); @@ -1403,11 +1403,11 @@ fn test_validate_currency_email_dynamicenum_validator() { result, Some(vec![ ( - "fee".to_owned(), + "/fee".to_owned(), "\"Ð 100.00\" is not a \"currency\"".to_owned() ), ( - "email".to_owned(), + "/email".to_owned(), "\"thisisnotanemail\" is not a 
\"email\"".to_owned() ) ]) @@ -1449,11 +1449,11 @@ fn test_validate_currency_email_dynamicenum_validator() { result, Some(vec![ ( - "name".to_owned(), + "/name".to_owned(), "\"T\" is shorter than 2 characters".to_owned() ), ( - "agency".to_owned(), + "/agency".to_owned(), "\"MODA\" is not a valid dynamicEnum value".to_owned() ) ]) @@ -1461,7 +1461,7 @@ fn test_validate_currency_email_dynamicenum_validator() { 4 => assert_eq!( result, Some(vec![( - "name".to_owned(), + "/name".to_owned(), "\"X\" is shorter than 2 characters".to_owned() )]) ), @@ -1469,14 +1469,14 @@ fn test_validate_currency_email_dynamicenum_validator() { 6 => assert_eq!( result, Some(vec![( - "agency".to_owned(), + "/agency".to_owned(), "\"NYFD\" is not a valid dynamicEnum value".to_owned() )]) ), 7 => assert_eq!( result, Some(vec![( - "fee".to_owned(), + "/fee".to_owned(), "\"WAX 100.000,00\" is not a \"currency\"".to_owned() )]) ), @@ -1484,15 +1484,15 @@ fn test_validate_currency_email_dynamicenum_validator() { result, Some(vec![ ( - "fee".to_owned(), + "/fee".to_owned(), "\"B 1,000,000\" is not a \"currency\"".to_owned() ), ( - "email".to_owned(), + "/email".to_owned(), "\"71076.964-compuserve\" is not a \"email\"".to_owned() ), ( - "agency".to_owned(), + "/agency".to_owned(), "\"ABCD\" is not a valid dynamicEnum value".to_owned() ) ]) From e848d184ef6b6131e04e5ffa853b5794f2118b0e Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sun, 13 Oct 2024 08:23:02 -0400 Subject: [PATCH 033/119] `deps`: bump `bytemuck` from 1.18 to 1.19; enable `latest_stable_rust` feature should squeeze out a bit more perf as qsv is always on the latest stable rust --- Cargo.lock | 5 +++-- Cargo.toml | 4 +++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a5cf98eb1..af295bc03 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1029,9 +1029,9 @@ checksum = "5ce89b21cab1437276d2650d57e971f9d548a2d9037cc231abdc0562b97498ce" [[package]] 
name = "bytemuck" -version = "1.18.0" +version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94bbb0ad554ad961ddc5da507a12a29b14e4ae5bda06b19f575a3e6079d2e2ae" +checksum = "8334215b81e418a0a7bdb8ef0849474f40bb10c8b71f1c4ed315cff49f32494d" dependencies = [ "bytemuck_derive", ] @@ -5446,6 +5446,7 @@ dependencies = [ "assert-json-diff", "atoi_simd 0.16.0", "base62", + "bytemuck", "byteorder", "bytes", "cached", diff --git a/Cargo.toml b/Cargo.toml index 4fdd14fd3..b1151f579 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -75,6 +75,7 @@ arrow = { version = "53", default-features = false, features = [ ], optional = true } atoi_simd = "0.16" base62 = { version = "2.0", optional = true } +bytemuck = { version = "1.19", features = ["latest_stable_rust"], optional = true } byteorder = "1.5" bytes = "1" cached = { version = "0.53", features = [ @@ -344,7 +345,6 @@ apply = [ "strsim", "thousands", "titlecase", - # "vader_sentiment", "whatlang", ] fetch = [ @@ -361,6 +361,7 @@ fetch = [ ] foreach = ["local-encoding"] geocode = [ + "bytemuck", "cached", "geosuggest-core", "geosuggest-utils", @@ -368,6 +369,7 @@ geocode = [ "simple-expand-tilde", ] luau = ["mlua", "sanitize-filename", "simple-expand-tilde"] +polars = [ "dep:polars", "bytemuck" ] prompt = ["rfd"] python = ["pyo3"] to = ["csvs_convert", "xlsxwriter"] From 220514d3823ff10953add5a47e09f13bc6107955 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sun, 13 Oct 2024 08:24:38 -0400 Subject: [PATCH 034/119] BetterTOML format --- Cargo.toml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index b1151f579..081e12129 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -75,7 +75,9 @@ arrow = { version = "53", default-features = false, features = [ ], optional = true } atoi_simd = "0.16" base62 = { version = "2.0", optional = true } -bytemuck = { version = "1.19", features = ["latest_stable_rust"], 
optional = true } +bytemuck = { version = "1.19", features = [ + "latest_stable_rust", +], optional = true } byteorder = "1.5" bytes = "1" cached = { version = "0.53", features = [ @@ -369,7 +371,7 @@ geocode = [ "simple-expand-tilde", ] luau = ["mlua", "sanitize-filename", "simple-expand-tilde"] -polars = [ "dep:polars", "bytemuck" ] +polars = ["dep:polars", "bytemuck"] prompt = ["rfd"] python = ["pyo3"] to = ["csvs_convert", "xlsxwriter"] From 2eb7022b8655ac74bed9bd77b57f1d03249a78aa Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sun, 13 Oct 2024 10:16:53 -0400 Subject: [PATCH 035/119] `extdedup`: typo c + p error [skip ci] --- src/cmd/extdedup.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cmd/extdedup.rs b/src/cmd/extdedup.rs index f0b1abe5f..9abe81baa 100644 --- a/src/cmd/extdedup.rs +++ b/src/cmd/extdedup.rs @@ -25,7 +25,7 @@ Usage: extdedup options: -s, --select Select a subset of columns to dedup. Note that the outputs will remain at the full width of the CSV. - If --select is NOT set, extdedup will work in LINE MODE, sorting + If --select is NOT set, extdedup will work in LINE MODE, deduping the input as a text file on a line-by-line basis. --no-output Do not write deduplicated output to . Use this if you only want to know the duplicate count. @@ -43,7 +43,7 @@ extdedup options: Common options: CSV MODE ONLY: -n, --no-headers When set, the first row will not be interpreted - as headers. That is, it will be sorted with the rest + as headers. That is, it will be deduped with the rest of the rows. Otherwise, the first row will always appear as the header row in the output. -d, --delimiter The field delimiter for reading CSV data. 
From 0ab913e0f980355e910c2a2c18c6a165e175c87d Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sun, 13 Oct 2024 10:55:53 -0400 Subject: [PATCH 036/119] `extdedup`: make calculate_memory_limit public so we can use it in `extsort` [skip ci] --- src/cmd/extdedup.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cmd/extdedup.rs b/src/cmd/extdedup.rs index 9abe81baa..3eedd04c8 100644 --- a/src/cmd/extdedup.rs +++ b/src/cmd/extdedup.rs @@ -278,7 +278,7 @@ fn dedup_lines(args: Args, mem_limited_buffer: u64) -> Result 50, it's treated as megabytes, but capped at 90% of total system memory. -fn calculate_memory_limit(flag_memory_limit: Option) -> u64 { +pub fn calculate_memory_limit(flag_memory_limit: Option) -> u64 { if !sysinfo::IS_SUPPORTED_SYSTEM { return MEMORY_LIMITED_BUFFER; } From 7bb510914c418ed22f939a9a983f53d9854abc21 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sun, 13 Oct 2024 18:24:09 -0400 Subject: [PATCH 037/119] `tests`: add test file for extsort csvmode --- .../adur-public-toilets-extsorted-csvmode.csv | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 resources/test/adur-public-toilets-extsorted-csvmode.csv diff --git a/resources/test/adur-public-toilets-extsorted-csvmode.csv b/resources/test/adur-public-toilets-extsorted-csvmode.csv new file mode 100644 index 000000000..3a8805846 --- /dev/null +++ b/resources/test/adur-public-toilets-extsorted-csvmode.csv @@ -0,0 +1,16 @@ +ExtractDate,OrganisationURI,OrganisationLabel,ServiceTypeURI,ServiceTypeLabel,LocationText,CoordinateReferenceSystem,GeoX,GeoY,GeoPointLicensingURL,Category,AccessibleCategory,RADARKeyNeeded,BabyChange,FamilyToilet,ChangingPlace,AutomaticPublicConvenience,FullTimeStaffing,PartOfCommunityScheme,CommunitySchemeName,ChargeAmount,InfoURL,OpeningHours,ManagedBy,ReportEmail,ReportTel,Notes,UPRN,Postcode,StreetAddress,GeoAreaURI,GeoAreaLabel 
+07/07/2014 00:00,http://opendatacommunities.org/id/district-council/adur,Adur,http://id.esd.org.uk/service/579,Public toilets,BEACH TOILETS BASIN ROAD SOUTH SOUTHWICK,OSGB36,522083,105168,http://www.ordnancesurvey.co.uk/business-and-government/help-and-support/public-sector/guidance/derived-data-exemptions.html,Female and male,Unisex,Yes,No,No,No,No,No,No,,,http://www.adur-worthing.gov.uk/streets-and-travel/public-toilets/,09.00 - 17.00,ADC,surveyor_15@adur-worthing.gov.uk,01903 221471,,60034215,,PUBLIC CONVENIENCES CIVIC CENTRE HAM ROAD SHOREHAM-BY-SEA,, +07/07/2014 00:00,http://opendatacommunities.org/id/district-council/adur,Adur,http://id.esd.org.uk/service/579,Public toilets,BEACH TOILETS BASIN ROAD SOUTH SOUTHWICK,OSGB36,522007,106062,http://www.ordnancesurvey.co.uk/business-and-government/help-and-support/public-sector/guidance/derived-data-exemptions.html,Female and male,None,No,No,No,No,No,No,No,,,http://www.adur-worthing.gov.uk/streets-and-travel/public-toilets/,,ADC,surveyor_14@adur-worthing.gov.uk,01903 221471,Grounds staff only not public,60032527,,PUBLIC CONVENIENCE NORTH PAVILION BUCKINGHAM PARK UPPER SHOREHAM ROAD SHOREHAM-BY-SEA,, +07/07/2014 00:00,http://opendatacommunities.org/id/district-council/adur,Adur,http://id.esd.org.uk/service/579,Public toilets,PUBLIC CONVENIENCES CEMETERY MILL LANE SHOREHAM-BY-SEA,OSGB36,521440,105725,http://www.ordnancesurvey.co.uk/business-and-government/help-and-support/public-sector/guidance/derived-data-exemptions.html,Female and male,None,No,No,No,No,No,No,No,,,http://www.adur-worthing.gov.uk/streets-and-travel/public-toilets/,,ADC,surveyor_9@adur-worthing.gov.uk,01903 221471,Grounds staff only not public,60014340,,PUBLIC CONVENIENCES CEMETERY MILL LANE SHOREHAM-BY-SEA,, +07/07/2014 00:00,http://opendatacommunities.org/id/district-council/adur,Adur,http://id.esd.org.uk/service/579,Public toilets,PUBLIC CONVENIENCE SOUTHWICK STREET 
SOUTHWICK,OSGB36,524401,105405,http://www.ordnancesurvey.co.uk/business-and-government/help-and-support/public-sector/guidance/derived-data-exemptions.html,Female and male,Unisex,Yes,No,No,No,No,No,No,,,http://www.adur-worthing.gov.uk/streets-and-travel/public-toilets/,S = 08:00 - 21:00 W = 08:00 - 17:00,ADC,surveyor_11@adur-worthing.gov.uk,01903 221471,,60026354,,PUBLIC CONVENIENCE SOUTHWICK STREET SOUTHWICK,, +07/07/2014 00:00,http://opendatacommunities.org/id/district-council/adur,Adur,http://id.esd.org.uk/service/579,Public toilets,PUBLIC CONVENIENCES ADUR RECREATION GROUND BRIGHTON ROAD SHOREHAM-BY-SEA,OSGB36,521048,104977,http://www.ordnancesurvey.co.uk/business-and-government/help-and-support/public-sector/guidance/derived-data-exemptions.html,Female and male,Unisex,Yes,No,No,No,No,No,No,,,http://www.adur-worthing.gov.uk/streets-and-travel/public-toilets/,S = 08:00 - 21:00 W = 08:00 - 17:00,ADC,surveyor_6@adur-worthing.gov.uk,01903 221471,,60009666,,PUBLIC CONVENIENCES ADUR RECREATION GROUND BRIGHTON ROAD SHOREHAM-BY-SEA,, +07/07/2014 00:00,http://opendatacommunities.org/id/district-council/adur,Adur,http://id.esd.org.uk/service/579,Public toilets,PUBLIC CONVENIENCES MONKS RECREATION GROUND CRABTREE LANE LANCING,OSGB36,518225,104730,http://www.ordnancesurvey.co.uk/business-and-government/help-and-support/public-sector/guidance/derived-data-exemptions.html,Female and male,None,Yes,No,No,No,No,No,No,,,http://www.adur-worthing.gov.uk/streets-and-travel/public-toilets/,S = 09:00 - 15:00 W = 09:00 - 15:00,ADC,surveyor_2@adur-worthing.gov.uk,01903 221471,,60002210,,PUBLIC CONVENIENCES MONKS RECREATION GROUND CRABTREE LANE LANCING,, +,http://opendatacommunities.org/id/district-council/adur,,http://id.esd.org.uk/service/579,Public toilets,BEACH GREEN PUBLIC CONVENIENCES BRIGHTON ROAD LANCING,OSGB36,518072,103649,http://www.ordnancesurvey.co.uk/business-and-government/help-and-support/public-sector/guidance/derived-data-exemptions.html,Female and 
male,Unisex,Yes,No,No,No,No,No,No,,,http://www.adur-worthing.gov.uk/streets-and-travel/public-toilets/,S = 09:00 - 21:00 W = 09:00 - 17:00 ,ADC,surveyor_1@adur-worthing.gov.uk,01903 221471,,60001449,,BEACH GREEN PUBLIC CONVENIENCES BRIGHTON ROAD LANCING,, +07/07/2014 00:00,http://opendatacommunities.org/id/district-council/adur,Adur,http://id.esd.org.uk/service/579,Public toilets,BEACH TOILETS BASIN ROAD SOUTH SOUTHWICK,OSGB36,524375,104753,http://www.ordnancesurvey.co.uk/business-and-government/help-and-support/public-sector/guidance/derived-data-exemptions.html,Female and male,Unisex,Yes,No,No,No,No,No,No,,,http://www.adur-worthing.gov.uk/streets-and-travel/public-toilets/,S = 09:00 - 21:00 W = 09:00 - 17:00,ADC,surveyor_13@adur-worthing.gov.uk,01903 221471,,60029181,,BEACH TOILETS BASIN ROAD SOUTH SOUTHWICK,, +07/07/2014 00:00,http://opendatacommunities.org/id/district-council/adur,Adur,http://id.esd.org.uk/service/579,Public toilets,PUBLIC CONVENIENCES BEACH GREEN SHOREHAM-BY-SEA,OSGB36,521299,104515,http://www.ordnancesurvey.co.uk/business-and-government/help-and-support/public-sector/guidance/derived-data-exemptions.html,Female and male,Unisex,Yes,No,No,No,No,No,No,,,http://www.adur-worthing.gov.uk/streets-and-travel/public-toilets/,S = 09:00 - 21:00 W = 09:00 - 17:00,ADC,surveyor_5@adur-worthing.gov.uk,01903 221471,,60009402,,PUBLIC CONVENIENCES BEACH GREEN SHOREHAM-BY-SEA,, +07/07/2014 00:00,http://opendatacommunities.org/id/district-council/adur,Adur,http://id.esd.org.uk/service/579,Public toilets,PUBLIC CONVENIENCES FORTHAVEN SHOREHAM-BY-SEA,OSGB36,523294,104588,http://www.ordnancesurvey.co.uk/business-and-government/help-and-support/public-sector/guidance/derived-data-exemptions.html,Female and male,Unisex,Yes,No,No,No,No,No,No,,,http://www.adur-worthing.gov.uk/streets-and-travel/public-toilets/,S = 09:00 - 21:00 W = 09:00 - 17:00,ADC,surveyor_7@adur-worthing.gov.uk,01903 221471,,60011970,,PUBLIC CONVENIENCES FORTHAVEN SHOREHAM-BY-SEA,, +07/07/2014 
00:00,http://opendatacommunities.org/id/district-council/adur,Adur,http://id.esd.org.uk/service/579,Public toilets,PUBLIC CONVENIENCES MIDDLE STREET SHOREHAM-BY-SEA,OSGB36,521515,105083,http://www.ordnancesurvey.co.uk/business-and-government/help-and-support/public-sector/guidance/derived-data-exemptions.html,Female and male,Unisex,Yes,No,No,No,No,No,No,,,http://www.adur-worthing.gov.uk/streets-and-travel/public-toilets/,S = 09:00 - 21:00 W = 09:00 - 17:00,ADC,surveyor_8@adur-worthing.gov.uk,01903 221471,,60014163,,PUBLIC CONVENIENCES MIDDLE STREET SHOREHAM-BY-SEA,, +2014-07-07 00:00,http://opendatacommunities.org/id/district-council/adur,Adur,http://id.esd.org.uk/service/579,Public toilets,PUBLIC CONVENIENCES SHOPSDAM ROAD LANCING,OSGB3,518915,103795,http://www.ordnancesurvey.co.uk/business-and-government/help-and-support/public-sector/guidance/derived-data-exemptions.html,Mens,Unisex,Yes,No,No,No,No,No,No,,,http://www.adur-worthing.gov.uk/streets-and-travel/public-toilets/,S = 09:00 - 21:00 W = 09:00 - 17:00,ADC,surveyor_3@adur-worthing.gov.uk,01903 221471,,60007428,,,, +07/07/2014 00:00,http://opendatacommunities.org/id/district-council/adur,Adur,http://id.esd.org.uk/service/579,Public toilets,PUBLIC CONVENIENCES SOUTH PAVILION BUCKINGHAM PARK UPPER SHOREHAM ROAD SHOREHAM-BY-SEA,OSGB36,522118,105939,http://www.ordnancesurvey.co.uk/business-and-government/help-and-support/public-sector/guidance/derived-data-exemptions.html,Female and male,None,No,No,No,No,No,No,No,,,http://www.adur-worthing.gov.uk/streets-and-travel/public-toilets/,S = 09:00 - 21:00 W = 09:00 - 17:00,ADC,surveyor_10@adur-worthing.gov.uk,01903 221471,,60017866,,PUBLIC CONVENIENCES SOUTH PAVILION BUCKINGHAM PARK UPPER SHOREHAM ROAD SHOREHAM-BY-SEA,, +07/07/2014 00:00,http://opendatacommunities.org/id/district-council/adur,Adur,http://id.esd.org.uk/service/579,Public toilets,PUBLIC CONVENIENCES YEW TREE CLOSE 
LANCING,OSGB36,518222,104168,http://www.ordnancesurvey.co.uk/business-and-government/help-and-support/public-sector/guidance/derived-data-exemptions.html,Female and male,Unisex,Yes,No,No,No,No,No,No,,,http://www.adur-worthing.gov.uk/streets-and-travel/public-toilets/,S = 09:00 - 21:00 W = 09:00 - 17:00,ADC,surveyor_4@adur-worthing.gov.uk,01903 221471,,60008859,,PUBLIC CONVENIENCES YEW TREE CLOSE LANCING,, +07/07/2014 00:00,http://opendatacommunities.org/id/district-council/adur,Adur,http://id.esd.org.uk/service/579,Public toilets,WEST BEACH PUBLIC CONVENIENCES WEST BEACH ROAD LANCING,OSGB36,520354,104246,http://www.ordnancesurvey.co.uk/business-and-government/help-and-support/public-sector/guidance/derived-data-exemptions.html,Female and male,Unisex,Yes,No,No,No,No,No,No,,,http://www.adur-worthing.gov.uk/streets-and-travel/public-toilets/,S = 09:00 - 21:00 W = 09:00 - 17:00,,surveyor_12@adur-worthing.gov.uk,01903 221471,,60028994,,WEST BEACH PUBLIC CONVENIENCES WEST BEACH ROAD LANCING,, From c1c75eb6b6fcf1a1e7e7da164ce740800ada5cc9 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sun, 13 Oct 2024 18:24:59 -0400 Subject: [PATCH 038/119] `extsort`: like `extdedup`, it now has two modes - CSV mode and LINE mode --- src/cmd/extsort.rs | 265 ++++++++++++++++++++++++++++++++++++--------- 1 file changed, 215 insertions(+), 50 deletions(-) diff --git a/src/cmd/extsort.rs b/src/cmd/extsort.rs index 127c79d3d..a26643b11 100644 --- a/src/cmd/extsort.rs +++ b/src/cmd/extsort.rs @@ -1,17 +1,30 @@ static USAGE: &str = r#" Sort an arbitrarily large CSV/text file using a multithreaded external sort algorithm. -This command is not specific to CSV data, it sorts any text file on a -line-by-line basis. If sorting a non-CSV file, be sure to set --no-headers, -otherwise, the first line will not be included in the external sort. +This command has TWO modes of operation. 
+ + * CSV MODE + when --select is set, it sorts based on the given column/s. Requires an index. + See `qsv select --help` for select syntax details. + * LINE MODE + when --select is NOT set, it sorts any input text file (not just CSVs) on a + line-by-line basis. If sorting a non-CSV file, be sure to set --no-headers, + otherwise, the first line will not be included in the external sort. Usage: qsv extsort [options] [] [] qsv extsort --help External sort option: - --memory-limit The maximum amount of memory to buffer the on-disk hash table. - This is a percentage of total memory. + -s, --select Select a subset of columns to sort (CSV MODE). + Note that the outputs will remain at the full width of the CSV. + If --select is NOT set, extsort will work in LINE MODE, sorting + the input as a text file on a line-by-line basis. + -R, --reverse Reverse order + --memory-limit The maximum amount of memory to buffer the external merge sort. + If less than 50, this is a percentage of total memory. + If more than 50, this is the memory in MB to allocate, capped + at 90 percent of total memory. [default: 10] --tmp-dir The directory to use for externally sorting file segments. [default: ./] @@ -20,6 +33,10 @@ External sort option: number of CPUs detected. Common options: + CSV MODE ONLY: + -d, --delimiter The field delimiter for reading CSV data. + Must be a single character. 
(default: ,) + -h, --help Display this message -n, --no-headers When set, the first row will not be interpreted as headers and will be sorted with the rest @@ -35,21 +52,28 @@ use std::{ use ext_sort::{buffer::mem::MemoryLimitedBufferBuilder, ExternalSorter, ExternalSorterBuilder}; use serde::Deserialize; -use sysinfo::System; -use crate::{config, util, CliResult}; +use crate::{ + cmd::extdedup::calculate_memory_limit, + config, + config::{Config, Delimiter}, + select::SelectColumns, + util, CliResult, +}; #[derive(Deserialize)] struct Args { arg_input: Option, arg_output: Option, + flag_select: Option, + flag_reverse: bool, + flag_delimiter: Option, flag_jobs: Option, - flag_memory_limit: Option, + flag_memory_limit: Option, flag_tmp_dir: Option, flag_no_headers: bool, } -const MEMORY_LIMITED_BUFFER: u64 = 100 * 1_000_000; // 100 MB const RW_BUFFER_CAPACITY: usize = 1_000_000; // 1 MB pub fn run(argv: &[&str]) -> CliResult<()> { @@ -66,20 +90,172 @@ pub fn run(argv: &[&str]) -> CliResult<()> { None => "./".to_string(), }; - // memory buffer to use for external merge sort, - // if we can detect the total memory, use 10% of it by default - // and up to --memory-limit (capped at 50%), - // otherwise, if we cannot detect the free memory use a default of 100 MB - let mem_limited_buffer = if sysinfo::IS_SUPPORTED_SYSTEM { - let mut sys = System::new(); - sys.refresh_memory(); - (sys.total_memory() * 1000) / u8::min(args.flag_memory_limit.unwrap_or(10), 50) as u64 + // Set the memory buffer size for the external merge sort based on --memory-limit + // and system capabilities. 
+ let mem_limited_buffer_bytes = calculate_memory_limit(args.flag_memory_limit); + log::info!("{mem_limited_buffer_bytes} bytes used for in memory mergesort buffer..."); + + let sorter: ExternalSorter = + match ExternalSorterBuilder::new() + .with_tmp_dir(path::Path::new(&tmp_dir)) + .with_buffer(MemoryLimitedBufferBuilder::new(mem_limited_buffer_bytes)) + .with_rw_buf_size(RW_BUFFER_CAPACITY) + .with_threads_number(util::njobs(args.flag_jobs)) + .build() + { + Ok(sorter) => sorter, + Err(e) => { + return fail_clierror!("cannot create external sorter: {e}"); + }, + }; + + if args.flag_select.is_some() { + sort_csv(&args, &tmp_dir, &sorter) + } else { + sort_lines(&args, &sorter) + } +} + +fn sort_csv( + args: &Args, + tmp_dir: &str, + sorter: &ExternalSorter, +) -> Result<(), crate::clitypes::CliError> { + let rconfig = Config::new(args.arg_input.as_ref()) + .delimiter(args.flag_delimiter) + .no_headers(args.flag_no_headers) + .select(args.flag_select.clone().unwrap()); + + let mut idxfile = if let Ok(idx) = rconfig.indexed() { + if idx.is_none() { + return fail_incorrectusage_clierror!("extsort CSV mode requires an index"); + } + idx.unwrap() + } else { + return fail_incorrectusage_clierror!("extsort CSV mode requires an index"); + }; + + let mut input_rdr = rconfig.reader()?; + + let linewtr_tfile = tempfile::NamedTempFile::new_in(tmp_dir)?; + let mut line_wtr = io::BufWriter::with_capacity(RW_BUFFER_CAPACITY, linewtr_tfile.as_file()); + + let headers = input_rdr.byte_headers()?.clone(); + let sel = rconfig.selection(&headers)?; + + let mut sort_key = String::with_capacity(20); + let mut utf8_string = String::with_capacity(20); + let mut curr_row = csv::ByteRecord::new(); + + let rowcount = idxfile.count(); + let width = rowcount.to_string().len(); + + // first pass. get the selected columns, and the record position + // then write them to a temp text file with the selected columns and the position + // separated by "". 
Pad the position with leading zeroes, so it will always be the same width + for row in input_rdr.byte_records() { + curr_row.clone_from(&row?); + sort_key.clear(); + for field in sel.select(&curr_row) { + if let Ok(s_utf8) = simdutf8::basic::from_utf8(field) { + sort_key.push_str(s_utf8); + } else { + utf8_string.clear(); + utf8_string.push_str(&String::from_utf8_lossy(field)); + sort_key.push_str(&utf8_string); + } + } + let idx_position = curr_row.position().unwrap(); + + sort_key.push_str(&format!("{:01$}", idx_position.line(), width)); + + writeln!(line_wtr, "{sort_key}")?; + } + line_wtr.flush()?; + + let line_rdr = io::BufReader::with_capacity( + RW_BUFFER_CAPACITY, + std::fs::File::open(linewtr_tfile.path())?, + ); + + let reverse_flag = args.flag_reverse; + let compare = |a: &String, b: &String| { + if reverse_flag { + a.cmp(b).reverse() + } else { + a.cmp(b) + } + }; + + // Now sort the temp text file + let sorted = match sorter.sort_by(line_rdr.lines(), compare) { + Ok(sorted) => sorted, + Err(e) => { + return fail!(format!("cannot do external sort: {e:?}")); + }, + }; + + let sorted_tfile = tempfile::NamedTempFile::new_in(tmp_dir)?; + let mut sorted_line_wtr = + io::BufWriter::with_capacity(RW_BUFFER_CAPACITY, sorted_tfile.as_file()); + + for item in sorted.map(Result::unwrap) { + sorted_line_wtr.write_all(format!("{item}\n").as_bytes())?; + } + sorted_line_wtr.flush()?; + // Delete the temporary file containing unsorted lines + drop(line_wtr); + linewtr_tfile.close()?; + + // now write the sorted CSV file by reading the sorted_line temp file + // and extracting the position from each line + // and then using that to seek the input file to retrieve the record + // and then write the record to the final sorted CSV + let sorted_lines = std::fs::File::open(sorted_tfile.path())?; + let sorted_line_rdr = io::BufReader::with_capacity(RW_BUFFER_CAPACITY, sorted_lines); + + let mut sorted_csv_wtr = Config::new(args.arg_output.as_ref()).writer()?; + + let 
position_delta: u64 = if args.flag_no_headers { + 1 } else { - MEMORY_LIMITED_BUFFER + // Write the header row if --no-headers is false + sorted_csv_wtr.write_byte_record(&headers)?; + 2 }; - log::info!("{mem_limited_buffer} bytes used for in memory mergesort buffer..."); - let mut input_reader: Box = match &args.arg_input { + // amortize allocations + let mut record_wrk = csv::ByteRecord::new(); + let mut line = String::new(); + #[allow(unused_assignments)] + let mut line_parts: Vec<&str> = Vec::with_capacity(2); + + for l in sorted_line_rdr.lines() { + line.clone_from(&l?); + line_parts = line.rsplitn(2, "").collect(); + if line_parts.len() != 2 { + return fail_clierror!("Invalid sorted line format"); + } + let position: u64 = line_parts[0] + .parse() + .map_err(|e| format!("Failed to retrieve position: {e}"))?; + + idxfile.seek(position - position_delta)?; + idxfile.read_byte_record(&mut record_wrk)?; + sorted_csv_wtr.write_byte_record(&record_wrk)?; + } + sorted_csv_wtr.flush()?; + drop(sorted_line_wtr); + sorted_tfile.close()?; + + Ok(()) +} + +fn sort_lines( + args: &Args, + sorter: &ExternalSorter, +) -> Result<(), crate::clitypes::CliError> { + let mut input_rdr: Box = match &args.arg_input { Some(input_path) => { if input_path.to_lowercase().ends_with(".sz") { return fail_incorrectusage_clierror!( @@ -95,7 +271,7 @@ pub fn run(argv: &[&str]) -> CliResult<()> { None => Box::new(io::BufReader::new(stdin().lock())), }; - let mut output_writer: Box = match &args.arg_output { + let mut output_wtr: Box = match &args.arg_output { Some(output_path) => { if output_path.to_lowercase().ends_with(".sz") { return fail_clierror!( @@ -114,45 +290,34 @@ pub fn run(argv: &[&str]) -> CliResult<()> { )), }; - let sorter: ExternalSorter = - match ExternalSorterBuilder::new() - .with_tmp_dir(path::Path::new(&tmp_dir)) - .with_buffer(MemoryLimitedBufferBuilder::new(mem_limited_buffer)) - .with_rw_buf_size(RW_BUFFER_CAPACITY) - .with_threads_number(util::njobs(args.flag_jobs)) - 
.build() - { - Ok(sorter) => sorter, - Err(e) => { - return fail_clierror!("cannot create external sorter: {e}"); - }, - }; - let mut header = String::new(); if !args.flag_no_headers { - input_reader.read_line(&mut header)?; + input_rdr.read_line(&mut header)?; } - let Ok(sorted) = sorter.sort(input_reader.lines()) else { - return fail!("cannot do external sort"); + let reverse_flag = args.flag_reverse; + let compare = |a: &String, b: &String| { + if reverse_flag { + a.cmp(b).reverse() + } else { + a.cmp(b) + } + }; + + let sorted = match sorter.sort_by(input_rdr.lines(), compare) { + Ok(sorted) => sorted, + Err(e) => { + return fail!(format!("cannot do external sort: {e:?}")); + }, }; if !header.is_empty() { - output_writer.write_all(format!("{}\n", header.trim_end()).as_bytes())?; + output_wtr.write_all(format!("{}\n", header.trim_end()).as_bytes())?; } for item in sorted.map(Result::unwrap) { - output_writer.write_all(format!("{item}\n").as_bytes())?; + output_wtr.write_all(format!("{item}\n").as_bytes())?; } - output_writer.flush()?; + output_wtr.flush()?; Ok(()) } - -#[test] -fn test_mem_check() { - // check to see if sysinfo return meminfo without segfaulting - let mut sys = System::new(); - sys.refresh_memory(); - let mem10percent = (sys.total_memory() * 1000) / 10; // 10 percent of total memory - assert!(mem10percent > 0); -} From cdfd85f08f0d939791541b134d2887ba14e8345f Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sun, 13 Oct 2024 18:25:20 -0400 Subject: [PATCH 039/119] `tests`: add `extsort` csv mode test --- tests/test_extsort.rs | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/tests/test_extsort.rs b/tests/test_extsort.rs index ba33a1ece..d91bb3788 100644 --- a/tests/test_extsort.rs +++ b/tests/test_extsort.rs @@ -3,8 +3,8 @@ use newline_converter::dos2unix; use crate::workdir::Workdir; #[test] -fn extsort() { - let wrk = 
Workdir::new("extsort").flexible(true); +fn extsort_linemode() { + let wrk = Workdir::new("extsort_linemode").flexible(true); wrk.clear_contents().unwrap(); // copy csv file to workdir @@ -24,3 +24,32 @@ fn extsort() { assert_eq!(dos2unix(&sorted_output), dos2unix(&expected_csv)); } + +#[test] +fn extsort_csvmode() { + let wrk = Workdir::new("extsort_csvmode").flexible(true); + wrk.clear_contents().unwrap(); + + // copy csv file to workdir + let unsorted_csv = wrk.load_test_resource("adur-public-toilets.csv"); + wrk.create_from_string("adur-public-toilets.csv", &unsorted_csv); + + // set the environment variable to autoindex + std::env::set_var("QSV_AUTOINDEX_SIZE", "1"); + + let mut cmd = wrk.command("extsort"); + cmd.arg("adur-public-toilets.csv") + .args(["--select", "OpeningHours,StreetAddress,LocationText"]) + .arg("adur-public-toilets-extsort-csvmode.csv"); + wrk.output(&mut cmd); + // unset the environment variable + std::env::remove_var("QSV_AUTOINDEX_SIZE"); + + // load sorted output + let sorted_output: String = wrk.from_str(&wrk.path("adur-public-toilets-extsort-csvmode.csv")); + + let expected_csv = wrk.load_test_resource("adur-public-toilets-extsorted-csvmode.csv"); + wrk.create_from_string("adur-public-toilets-extsorted-csvmode.csv", &expected_csv); + + assert_eq!(dos2unix(&sorted_output), dos2unix(&expected_csv)); +} From 76d7bde87d7a145e2e4396d4d20ac886b61f12f9 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sun, 13 Oct 2024 18:53:32 -0400 Subject: [PATCH 040/119] =?UTF-8?q?`docs`:=20update=20`extsort`=20emoji=20?= =?UTF-8?q?legend=20-=20adding=20index(=F0=9F=93=87)=20&=20selector(?= =?UTF-8?q?=F0=9F=91=86)=20support?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [skip ci] --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index fa383a58a..ebac99636 100644 --- a/README.md +++ b/README.md @@ -43,7 +43,7 @@ 
| [exclude](/src/cmd/exclude.rs#L2)
📇👆 | Removes a set of CSV data from another set based on the specified columns. | | [explode](/src/cmd/explode.rs#L2)
🔣👆 | Explode rows into multiple ones by splitting a column value based on the given separator. | | [extdedup](/src/cmd/extdedup.rs#L2)
👆 | Remove duplicate rows from an arbitrarily large CSV/text file using a memory-mapped, [on-disk hash table](https://crates.io/crates/odht). Unlike the `dedup` command, this command does not load the entire file into memory nor does it sort the deduped file. | -| [extsort](/src/cmd/extsort.rs#L2)
🚀 | Sort an arbitrarily large CSV/text file using a multithreaded [external merge sort](https://en.wikipedia.org/wiki/External_sorting) algorithm. | +| [extsort](/src/cmd/extsort.rs#L2)
🚀📇👆 | Sort an arbitrarily large CSV/text file using a multithreaded [external merge sort](https://en.wikipedia.org/wiki/External_sorting) algorithm. | | [fetch](/src/cmd/fetch.rs#L3)
✨🧠🌐 | Fetches data from web services for every row using **HTTP Get**. Comes with [HTTP/2](https://http2-explained.haxx.se/en/part1) [adaptive flow control](https://medium.com/coderscorner/http-2-flow-control-77e54f7fd518), [jql](https://github.com/yamafaktory/jql#%EF%B8%8F-usage) JSON query language support, dynamic throttling ([RateLimit](https://www.ietf.org/archive/id/draft-ietf-httpapi-ratelimit-headers-06.html)) & caching with available persistent caching using [Redis](https://redis.io/) or a disk-cache. | | [fetchpost](/src/cmd/fetchpost.rs#L3)
✨🧠🌐 | Similar to `fetch`, but uses **HTTP Post**. ([HTTP GET vs POST methods](https://www.geeksforgeeks.org/difference-between-http-get-and-post-methods/)) | | [fill](/src/cmd/fill.rs#L2)
👆 | Fill empty values. | From b0b0be4e0b0558f72d067d8ad536a316c46dba52 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sun, 13 Oct 2024 18:55:28 -0400 Subject: [PATCH 041/119] `docs`: update number of tests [skip ci] --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index ebac99636..1b2d1f110 100644 --- a/README.md +++ b/README.md @@ -435,7 +435,7 @@ Luau will also serve as the backbone of a whole library of **qsv recipes** - reu as command line interfaces go :shrug:. Its commands have numerous options but have sensible defaults. The usage text is written for a data analyst audience, not developers; and there are numerous examples in the usage text, with the tests doubling as examples as well. With [qsv pro](https://qsvpro.dathere.com), it has much expanded functionality while being easier to use with its Graphical User Interface. * **As Secure as Possible** - qsv is designed to be secure. It has no external runtime dependencies, is [written](https://aws.amazon.com/blogs/opensource/why-aws-loves-rust-and-how-wed-like-to-help/) [in](https://msrc.microsoft.com/blog/2019/07/why-rust-for-safe-systems-programming/) [Rust](https://opensource.googleblog.com/2023/06/rust-fact-vs-fiction-5-insights-from-googles-rust-journey-2022.html), and it's codebase is automatically audited for security vulnerabilities with automated [DevSkim](https://github.com/microsoft/DevSkim#devskim), ["cargo audit"](https://rustsec.org) and [Codacy](https://app.codacy.com/gh/jqnatividad/qsv/dashboard) Github Actions workflows. It uses the latest stable Rust version, with an aggressive MSRV policy and the latest version of all its dependencies. 
-It has an extensive test suite with ~1,550 tests, including several [property tests](https://medium.com/criteo-engineering/introduction-to-property-based-testing-f5236229d237) which [randomly generate](https://github.com/BurntSushi/quickcheck#quickcheck) parameters for oft-used commands. +It has an extensive test suite with ~1,570 tests, including several [property tests](https://medium.com/criteo-engineering/introduction-to-property-based-testing-f5236229d237) which [randomly generate](https://github.com/BurntSushi/quickcheck#quickcheck) parameters for oft-used commands. Its prebuilt binary archives are [zipsigned](https://github.com/Kijewski/zipsign#zipsign), so you can [verify their integrity](#verifying-the-integrity-of-the-prebuilt-binaries-zip-archives). Its self-update mechanism automatically verifies the integrity of the prebuilt binaries archive before applying an update. See [Security](docs/SECURITY.md) for more info. * **As Easy to Contribute to as Possible** - qsv is designed to be easy to contribute to, with a focus on maintainability. It's modular architecture allows the easy addition of self-contained commands gated by feature flags, the source code is heavily commented, the usage text is embedded, and there are helper functions that make it easy to create additional commands and supporting tests. See [Features](docs/FEATURES.md) and [Contributing](CONTRIBUTING.md) for more info. @@ -448,7 +448,7 @@ It can process well-formed CSVs in _any_ language so long as its UTF-8 encoded. Finally, though the default Geonames index of the `geocode` command is English-only, the index can be rebuilt with the `geocode index-update` subcommand with the `--languages` option to return place names in multiple languages ([with support for 254 languages](http://download.geonames.org/export/dump/alternatenames/)). ## Testing -qsv has ~1,550 tests in the [tests](https://github.com/jqnatividad/qsv/tree/master/tests) directory. 
+qsv has ~1,570 tests in the [tests](https://github.com/jqnatividad/qsv/tree/master/tests) directory. Each command has its own test suite in a separate file with the convention `test_.rs`. Apart from preventing regressions, the tests also serve as good illustrative examples, and are often linked from the usage text of each corresponding command. From 31dfb40948fa2d461990d3b2d0841b2462fb4e68 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sun, 13 Oct 2024 19:13:59 -0400 Subject: [PATCH 042/119] `deps`: update polars to latest upstream; bump async-compression from 0.4.14 to 0.4.15 --- Cargo.lock | 42 +++++++++++++++++++++--------------------- Cargo.toml | 4 ++-- 2 files changed, 23 insertions(+), 23 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index af295bc03..3dcb78fd3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -612,9 +612,9 @@ dependencies = [ [[package]] name = "async-compression" -version = "0.4.14" +version = "0.4.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "998282f8f49ccd6116b0ed8a4de0fbd3151697920e7c7533416d6e25e76434a7" +checksum = "e26a9844c659a2a293d239c7910b752f8487fe122c6c8bd1659bf85a6507c302" dependencies = [ "brotli 7.0.0", "flate2", @@ -4750,7 +4750,7 @@ dependencies = [ [[package]] name = "polars" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=48f6e9d#48f6e9dc7c62f60ee9cb2ef9313917b63c96b1f7" +source = "git+https://github.com/pola-rs/polars?rev=ff10b38#ff10b3814ae91be40cf046d74d5f9088f7b33fdf" dependencies = [ "getrandom", "polars-arrow", @@ -4769,7 +4769,7 @@ dependencies = [ [[package]] name = "polars-arrow" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=48f6e9d#48f6e9dc7c62f60ee9cb2ef9313917b63c96b1f7" +source = "git+https://github.com/pola-rs/polars?rev=ff10b38#ff10b3814ae91be40cf046d74d5f9088f7b33fdf" dependencies = [ "ahash", "atoi", @@ -4817,7 +4817,7 @@ dependencies = [ [[package]] name = 
"polars-compute" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=48f6e9d#48f6e9dc7c62f60ee9cb2ef9313917b63c96b1f7" +source = "git+https://github.com/pola-rs/polars?rev=ff10b38#ff10b3814ae91be40cf046d74d5f9088f7b33fdf" dependencies = [ "bytemuck", "either", @@ -4832,7 +4832,7 @@ dependencies = [ [[package]] name = "polars-core" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=48f6e9d#48f6e9dc7c62f60ee9cb2ef9313917b63c96b1f7" +source = "git+https://github.com/pola-rs/polars?rev=ff10b38#ff10b3814ae91be40cf046d74d5f9088f7b33fdf" dependencies = [ "ahash", "bitflags 2.6.0", @@ -4866,7 +4866,7 @@ dependencies = [ [[package]] name = "polars-error" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=48f6e9d#48f6e9dc7c62f60ee9cb2ef9313917b63c96b1f7" +source = "git+https://github.com/pola-rs/polars?rev=ff10b38#ff10b3814ae91be40cf046d74d5f9088f7b33fdf" dependencies = [ "avro-schema", "object_store", @@ -4879,7 +4879,7 @@ dependencies = [ [[package]] name = "polars-expr" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=48f6e9d#48f6e9dc7c62f60ee9cb2ef9313917b63c96b1f7" +source = "git+https://github.com/pola-rs/polars?rev=ff10b38#ff10b3814ae91be40cf046d74d5f9088f7b33fdf" dependencies = [ "ahash", "bitflags 2.6.0", @@ -4899,7 +4899,7 @@ dependencies = [ [[package]] name = "polars-io" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=48f6e9d#48f6e9dc7c62f60ee9cb2ef9313917b63c96b1f7" +source = "git+https://github.com/pola-rs/polars?rev=ff10b38#ff10b3814ae91be40cf046d74d5f9088f7b33fdf" dependencies = [ "ahash", "async-trait", @@ -4945,7 +4945,7 @@ dependencies = [ [[package]] name = "polars-json" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=48f6e9d#48f6e9dc7c62f60ee9cb2ef9313917b63c96b1f7" +source = "git+https://github.com/pola-rs/polars?rev=ff10b38#ff10b3814ae91be40cf046d74d5f9088f7b33fdf" dependencies = [ "ahash", "chrono", @@ -4966,7 +4966,7 @@ 
dependencies = [ [[package]] name = "polars-lazy" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=48f6e9d#48f6e9dc7c62f60ee9cb2ef9313917b63c96b1f7" +source = "git+https://github.com/pola-rs/polars?rev=ff10b38#ff10b3814ae91be40cf046d74d5f9088f7b33fdf" dependencies = [ "ahash", "bitflags 2.6.0", @@ -4992,7 +4992,7 @@ dependencies = [ [[package]] name = "polars-mem-engine" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=48f6e9d#48f6e9dc7c62f60ee9cb2ef9313917b63c96b1f7" +source = "git+https://github.com/pola-rs/polars?rev=ff10b38#ff10b3814ae91be40cf046d74d5f9088f7b33fdf" dependencies = [ "futures", "memmap2", @@ -5013,7 +5013,7 @@ dependencies = [ [[package]] name = "polars-ops" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=48f6e9d#48f6e9dc7c62f60ee9cb2ef9313917b63c96b1f7" +source = "git+https://github.com/pola-rs/polars?rev=ff10b38#ff10b3814ae91be40cf046d74d5f9088f7b33fdf" dependencies = [ "ahash", "argminmax", @@ -5046,7 +5046,7 @@ dependencies = [ [[package]] name = "polars-parquet" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=48f6e9d#48f6e9dc7c62f60ee9cb2ef9313917b63c96b1f7" +source = "git+https://github.com/pola-rs/polars?rev=ff10b38#ff10b3814ae91be40cf046d74d5f9088f7b33fdf" dependencies = [ "ahash", "async-stream", @@ -5074,7 +5074,7 @@ dependencies = [ [[package]] name = "polars-pipe" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=48f6e9d#48f6e9dc7c62f60ee9cb2ef9313917b63c96b1f7" +source = "git+https://github.com/pola-rs/polars?rev=ff10b38#ff10b3814ae91be40cf046d74d5f9088f7b33fdf" dependencies = [ "crossbeam-channel", "crossbeam-queue", @@ -5100,7 +5100,7 @@ dependencies = [ [[package]] name = "polars-plan" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=48f6e9d#48f6e9dc7c62f60ee9cb2ef9313917b63c96b1f7" +source = "git+https://github.com/pola-rs/polars?rev=ff10b38#ff10b3814ae91be40cf046d74d5f9088f7b33fdf" 
dependencies = [ "ahash", "bitflags 2.6.0", @@ -5133,7 +5133,7 @@ dependencies = [ [[package]] name = "polars-row" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=48f6e9d#48f6e9dc7c62f60ee9cb2ef9313917b63c96b1f7" +source = "git+https://github.com/pola-rs/polars?rev=ff10b38#ff10b3814ae91be40cf046d74d5f9088f7b33fdf" dependencies = [ "bytemuck", "polars-arrow", @@ -5144,7 +5144,7 @@ dependencies = [ [[package]] name = "polars-schema" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=48f6e9d#48f6e9dc7c62f60ee9cb2ef9313917b63c96b1f7" +source = "git+https://github.com/pola-rs/polars?rev=ff10b38#ff10b3814ae91be40cf046d74d5f9088f7b33fdf" dependencies = [ "indexmap", "polars-error", @@ -5156,7 +5156,7 @@ dependencies = [ [[package]] name = "polars-sql" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=48f6e9d#48f6e9dc7c62f60ee9cb2ef9313917b63c96b1f7" +source = "git+https://github.com/pola-rs/polars?rev=ff10b38#ff10b3814ae91be40cf046d74d5f9088f7b33fdf" dependencies = [ "hex", "once_cell", @@ -5177,7 +5177,7 @@ dependencies = [ [[package]] name = "polars-time" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=48f6e9d#48f6e9dc7c62f60ee9cb2ef9313917b63c96b1f7" +source = "git+https://github.com/pola-rs/polars?rev=ff10b38#ff10b3814ae91be40cf046d74d5f9088f7b33fdf" dependencies = [ "atoi", "bytemuck", @@ -5197,7 +5197,7 @@ dependencies = [ [[package]] name = "polars-utils" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=48f6e9d#48f6e9dc7c62f60ee9cb2ef9313917b63c96b1f7" +source = "git+https://github.com/pola-rs/polars?rev=ff10b38#ff10b3814ae91be40cf046d74d5f9088f7b33fdf" dependencies = [ "ahash", "bytemuck", diff --git a/Cargo.toml b/Cargo.toml index 081e12129..884247a52 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -314,9 +314,9 @@ governor = { git = "https://github.com/jqnatividad/governor", branch = "deps-bum # BUILD NOTE: Be sure to set QSV_POLARS_REV below to the 
latest commit short hash or tag # of polars/py-polars before building qsv. This allows us to show the polars rev/tag in --version. # if we are using a release version of Rust Polars, leave QSV_POLARS_REV empty -# QSV_POLARS_REV=48f6e9d +# QSV_POLARS_REV=ff10b38 # polars = { git = "https://github.com/pola-rs/polars", tag = "py-1.9.0" } -polars = { git = "https://github.com/pola-rs/polars", rev = "48f6e9d" } +polars = { git = "https://github.com/pola-rs/polars", rev = "ff10b38" } [features] default = ["mimalloc"] From 5342d6ae248c4f59fab0622592849a8eac873b93 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sun, 13 Oct 2024 22:33:46 -0400 Subject: [PATCH 043/119] `benchmarks`: v5.1.0 added extdedup_csv and extsort_csv benchmarks [skip ci] --- scripts/benchmarks.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/benchmarks.sh b/scripts/benchmarks.sh index 7fe7cb9d7..5b5aca4a5 100755 --- a/scripts/benchmarks.sh +++ b/scripts/benchmarks.sh @@ -42,7 +42,7 @@ arg_pat="$1" # the version of this script -bm_version=5.1.0 +bm_version=5.2.0 # CONFIGURABLE VARIABLES --------------------------------------- # change as needed to reflect your environment/workloads @@ -496,7 +496,9 @@ run exclude_multi_casei "$qsv_bin" exclude --ignore-case \'Incident Zip,Communit run --index exclude_multi_casei_index "$qsv_bin" exclude --ignore-case \'Incident Zip,Community Board,Agency\' "$data" \'Incident Zip,Community Board,Agency\' data_to_exclude.csv run explode "$qsv_bin" explode City "-" "$data" run extdedup "$qsv_bin" extdedup "$data" +run extdedup_csv "$qsv_bin" extdedup "$data" --select 1-5 run extsort "$qsv_bin" extsort data_unsorted.csv extsort_sorted.csv +run --index extsort_csv "$qsv_bin" extsort data_unsorted.csv --select 1-5 extsort_sorted.csv run fill "$qsv_bin" fill -v Unspecified \'Address Type\' "$data" run fixlengths "$qsv_bin" fixlengths "$data" run flatten "$qsv_bin" flatten "$data" From 
28c7f38231984895ab16b241773c013c86137c45 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sun, 13 Oct 2024 22:48:25 -0400 Subject: [PATCH 044/119] `docs`: add to PERFORMANCE.md `extsort` index requirement & index metrics [skip ci] --- docs/PERFORMANCE.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/PERFORMANCE.md b/docs/PERFORMANCE.md index 95f4a7019..b27ba6054 100644 --- a/docs/PERFORMANCE.md +++ b/docs/PERFORMANCE.md @@ -10,9 +10,9 @@ Indexing your CSV files is key for performance. Here's why: 3. **Parallel Processing**: Indexing enables multithreading, dramatically speeding up supported commands like `stats`, `frequency`, `sample`, `split` and `tojsonl`. -4. **Random Access**: The `luau` command gains random access capabilities. +4. **Random Access**: The `luau` command gains random access capabilities. `extsort` CSV mode requires an index. -5. **Low Overhead**: Creating an index is fast and efficient, even for very large files. +5. **Low Overhead**: Creating an index is fast and efficient, even for very large files. The million row, 41-column, 520mb NYC 311 benchmark file for instance, takes all of 466 ms to index. Even if you're only handling a CSV file once, and its not reference data, indexing still makes sense if you're `slicing`, `counting`, `sampling` or compiling summary statistics with the `stats` and `frequency` commands. 
From 08e8b290232d7d7ac4f11325d9b28701dfa44aec Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sun, 13 Oct 2024 23:40:22 -0400 Subject: [PATCH 045/119] `completions`: add `extsort` and `extdedup` new CSV options [skip ci] --- contrib/completions/examples/qsv.bash | 4 ++-- contrib/completions/examples/qsv.elv | 6 ++++++ contrib/completions/examples/qsv.fig.js | 18 ++++++++++++++++++ contrib/completions/examples/qsv.fish | 6 ++++++ contrib/completions/examples/qsv.nu | 6 ++++++ contrib/completions/examples/qsv.ps1 | 6 ++++++ contrib/completions/examples/qsv.zsh | 6 ++++++ contrib/completions/src/cmd/extdedup.rs | 3 +++ contrib/completions/src/cmd/extsort.rs | 3 +++ 9 files changed, 56 insertions(+), 2 deletions(-) diff --git a/contrib/completions/examples/qsv.bash b/contrib/completions/examples/qsv.bash index 34c86432f..643149478 100644 --- a/contrib/completions/examples/qsv.bash +++ b/contrib/completions/examples/qsv.bash @@ -1180,7 +1180,7 @@ _qsv() { return 0 ;; qsv__extdedup) - opts="-h --no-output --dupes-output --human-readable --memory-limit --quiet --help" + opts="-h --select --no-output --dupes-output --human-readable --memory-limit --no-headers --delimiter --quiet --help" if [[ ${cur} == -* || ${COMP_CWORD} -eq 2 ]] ; then COMPREPLY=( $(compgen -W "${opts}" -- "${cur}") ) return 0 @@ -1194,7 +1194,7 @@ _qsv() { return 0 ;; qsv__extsort) - opts="-h --memory-limit --tmp-dir --jobs --no-headers --help" + opts="-h --select --reverse --memory-limit --tmp-dir --jobs --delimiter --no-headers --help" if [[ ${cur} == -* || ${COMP_CWORD} -eq 2 ]] ; then COMPREPLY=( $(compgen -W "${opts}" -- "${cur}") ) return 0 diff --git a/contrib/completions/examples/qsv.elv b/contrib/completions/examples/qsv.elv index 2ad7c31c3..81fd97f93 100644 --- a/contrib/completions/examples/qsv.elv +++ b/contrib/completions/examples/qsv.elv @@ -380,18 +380,24 @@ set edit:completion:arg-completer[qsv] = {|@words| cand --help 'Print help' } 
&'qsv;extdedup'= { + cand --select 'select' cand --no-output 'no-output' cand --dupes-output 'dupes-output' cand --human-readable 'human-readable' cand --memory-limit 'memory-limit' + cand --no-headers 'no-headers' + cand --delimiter 'delimiter' cand --quiet 'quiet' cand -h 'Print help' cand --help 'Print help' } &'qsv;extsort'= { + cand --select 'select' + cand --reverse 'reverse' cand --memory-limit 'memory-limit' cand --tmp-dir 'tmp-dir' cand --jobs 'jobs' + cand --delimiter 'delimiter' cand --no-headers 'no-headers' cand -h 'Print help' cand --help 'Print help' diff --git a/contrib/completions/examples/qsv.fig.js b/contrib/completions/examples/qsv.fig.js index f934df9a9..b5fe30a75 100644 --- a/contrib/completions/examples/qsv.fig.js +++ b/contrib/completions/examples/qsv.fig.js @@ -743,6 +743,9 @@ const completion: Fig.Spec = { { name: "extdedup", options: [ + { + name: "--select", + }, { name: "--no-output", }, @@ -755,6 +758,12 @@ const completion: Fig.Spec = { { name: "--memory-limit", }, + { + name: "--no-headers", + }, + { + name: "--delimiter", + }, { name: "--quiet", }, @@ -767,6 +776,12 @@ const completion: Fig.Spec = { { name: "extsort", options: [ + { + name: "--select", + }, + { + name: "--reverse", + }, { name: "--memory-limit", }, @@ -776,6 +791,9 @@ const completion: Fig.Spec = { { name: "--jobs", }, + { + name: "--delimiter", + }, { name: "--no-headers", }, diff --git a/contrib/completions/examples/qsv.fish b/contrib/completions/examples/qsv.fish index 3ff658281..15fe36705 100644 --- a/contrib/completions/examples/qsv.fish +++ b/contrib/completions/examples/qsv.fish @@ -301,15 +301,21 @@ complete -c qsv -n "__fish_qsv_using_subcommand exclude" -l output complete -c qsv -n "__fish_qsv_using_subcommand exclude" -l no-headers complete -c qsv -n "__fish_qsv_using_subcommand exclude" -l delimiter complete -c qsv -n "__fish_qsv_using_subcommand exclude" -s h -l help -d 'Print help' +complete -c qsv -n "__fish_qsv_using_subcommand extdedup" -l select 
complete -c qsv -n "__fish_qsv_using_subcommand extdedup" -l no-output complete -c qsv -n "__fish_qsv_using_subcommand extdedup" -l dupes-output complete -c qsv -n "__fish_qsv_using_subcommand extdedup" -l human-readable complete -c qsv -n "__fish_qsv_using_subcommand extdedup" -l memory-limit +complete -c qsv -n "__fish_qsv_using_subcommand extdedup" -l no-headers +complete -c qsv -n "__fish_qsv_using_subcommand extdedup" -l delimiter complete -c qsv -n "__fish_qsv_using_subcommand extdedup" -l quiet complete -c qsv -n "__fish_qsv_using_subcommand extdedup" -s h -l help -d 'Print help' +complete -c qsv -n "__fish_qsv_using_subcommand extsort" -l select +complete -c qsv -n "__fish_qsv_using_subcommand extsort" -l reverse complete -c qsv -n "__fish_qsv_using_subcommand extsort" -l memory-limit complete -c qsv -n "__fish_qsv_using_subcommand extsort" -l tmp-dir complete -c qsv -n "__fish_qsv_using_subcommand extsort" -l jobs +complete -c qsv -n "__fish_qsv_using_subcommand extsort" -l delimiter complete -c qsv -n "__fish_qsv_using_subcommand extsort" -l no-headers complete -c qsv -n "__fish_qsv_using_subcommand extsort" -s h -l help -d 'Print help' complete -c qsv -n "__fish_qsv_using_subcommand explode" -l rename diff --git a/contrib/completions/examples/qsv.nu b/contrib/completions/examples/qsv.nu index 87dd07b1d..d2a25f330 100644 --- a/contrib/completions/examples/qsv.nu +++ b/contrib/completions/examples/qsv.nu @@ -298,18 +298,24 @@ module completions { ] export extern "qsv extdedup" [ + --select --no-output --dupes-output --human-readable --memory-limit + --no-headers + --delimiter --quiet --help(-h) # Print help ] export extern "qsv extsort" [ + --select + --reverse --memory-limit --tmp-dir --jobs + --delimiter --no-headers --help(-h) # Print help ] diff --git a/contrib/completions/examples/qsv.ps1 b/contrib/completions/examples/qsv.ps1 index 6d4852baa..b44822a6f 100644 --- a/contrib/completions/examples/qsv.ps1 +++ b/contrib/completions/examples/qsv.ps1 @@ 
-415,19 +415,25 @@ Register-ArgumentCompleter -Native -CommandName 'qsv' -ScriptBlock { break } 'qsv;extdedup' { + [CompletionResult]::new('--select', 'select', [CompletionResultType]::ParameterName, 'select') [CompletionResult]::new('--no-output', 'no-output', [CompletionResultType]::ParameterName, 'no-output') [CompletionResult]::new('--dupes-output', 'dupes-output', [CompletionResultType]::ParameterName, 'dupes-output') [CompletionResult]::new('--human-readable', 'human-readable', [CompletionResultType]::ParameterName, 'human-readable') [CompletionResult]::new('--memory-limit', 'memory-limit', [CompletionResultType]::ParameterName, 'memory-limit') + [CompletionResult]::new('--no-headers', 'no-headers', [CompletionResultType]::ParameterName, 'no-headers') + [CompletionResult]::new('--delimiter', 'delimiter', [CompletionResultType]::ParameterName, 'delimiter') [CompletionResult]::new('--quiet', 'quiet', [CompletionResultType]::ParameterName, 'quiet') [CompletionResult]::new('-h', 'h', [CompletionResultType]::ParameterName, 'Print help') [CompletionResult]::new('--help', 'help', [CompletionResultType]::ParameterName, 'Print help') break } 'qsv;extsort' { + [CompletionResult]::new('--select', 'select', [CompletionResultType]::ParameterName, 'select') + [CompletionResult]::new('--reverse', 'reverse', [CompletionResultType]::ParameterName, 'reverse') [CompletionResult]::new('--memory-limit', 'memory-limit', [CompletionResultType]::ParameterName, 'memory-limit') [CompletionResult]::new('--tmp-dir', 'tmp-dir', [CompletionResultType]::ParameterName, 'tmp-dir') [CompletionResult]::new('--jobs', 'jobs', [CompletionResultType]::ParameterName, 'jobs') + [CompletionResult]::new('--delimiter', 'delimiter', [CompletionResultType]::ParameterName, 'delimiter') [CompletionResult]::new('--no-headers', 'no-headers', [CompletionResultType]::ParameterName, 'no-headers') [CompletionResult]::new('-h', 'h', [CompletionResultType]::ParameterName, 'Print help') 
[CompletionResult]::new('--help', 'help', [CompletionResultType]::ParameterName, 'Print help') diff --git a/contrib/completions/examples/qsv.zsh b/contrib/completions/examples/qsv.zsh index d2c54db7d..d7b6aa822 100644 --- a/contrib/completions/examples/qsv.zsh +++ b/contrib/completions/examples/qsv.zsh @@ -416,10 +416,13 @@ _arguments "${_arguments_options[@]}" : \ ;; (extdedup) _arguments "${_arguments_options[@]}" : \ +'--select[]' \ '--no-output[]' \ '--dupes-output[]' \ '--human-readable[]' \ '--memory-limit[]' \ +'--no-headers[]' \ +'--delimiter[]' \ '--quiet[]' \ '-h[Print help]' \ '--help[Print help]' \ @@ -427,9 +430,12 @@ _arguments "${_arguments_options[@]}" : \ ;; (extsort) _arguments "${_arguments_options[@]}" : \ +'--select[]' \ +'--reverse[]' \ '--memory-limit[]' \ '--tmp-dir[]' \ '--jobs[]' \ +'--delimiter[]' \ '--no-headers[]' \ '-h[Print help]' \ '--help[Print help]' \ diff --git a/contrib/completions/src/cmd/extdedup.rs b/contrib/completions/src/cmd/extdedup.rs index 8fd648911..d75b38656 100644 --- a/contrib/completions/src/cmd/extdedup.rs +++ b/contrib/completions/src/cmd/extdedup.rs @@ -2,10 +2,13 @@ use clap::{arg, Command}; pub fn extdedup_cmd() -> Command { Command::new("extdedup").args([ + arg!(--select), arg!(--"no-output"), arg!(--"dupes-output"), arg!(--"human-readable"), arg!(--"memory-limit"), + arg!(--"no-headers"), + arg!(--delimiter), arg!(--quiet), ]) } diff --git a/contrib/completions/src/cmd/extsort.rs b/contrib/completions/src/cmd/extsort.rs index b6f9a6866..730570438 100644 --- a/contrib/completions/src/cmd/extsort.rs +++ b/contrib/completions/src/cmd/extsort.rs @@ -2,9 +2,12 @@ use clap::{arg, Command}; pub fn extsort_cmd() -> Command { Command::new("extsort").args([ + arg!(--select), + arg!(--reverse), arg!(--"memory-limit"), arg!(--"tmp-dir"), arg!(--jobs), + arg!(--delimiter), arg!(--"no-headers"), ]) } From 330bdc5740281069c072a3f7e8c5013eb1f89f79 Mon Sep 17 00:00:00 2001 From: Joel Natividad 
<1980690+jqnatividad@users.noreply.github.com> Date: Mon, 14 Oct 2024 06:47:02 -0400 Subject: [PATCH 046/119] `extsort`: perf refactor use atoi_simd for sort position parsing even though this hot loop is IO-bound, this approach is MUCH faster, simpler & concise --- src/cmd/extsort.rs | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/src/cmd/extsort.rs b/src/cmd/extsort.rs index a26643b11..6969cdde5 100644 --- a/src/cmd/extsort.rs +++ b/src/cmd/extsort.rs @@ -152,7 +152,7 @@ fn sort_csv( // first pass. get the selected columns, and the record position // then write them to a temp text file with the selected columns and the position - // separated by "". Pad the position with leading zeroes, so it will always be the same width + // separated by "|". Pad the position with leading zeroes, so it will always be the same width for row in input_rdr.byte_records() { curr_row.clone_from(&row?); sort_key.clear(); @@ -167,7 +167,7 @@ fn sort_csv( } let idx_position = curr_row.position().unwrap(); - sort_key.push_str(&format!("{:01$}", idx_position.line(), width)); + sort_key.push_str(&format!("|{:01$}", idx_position.line(), width)); writeln!(line_wtr, "{sort_key}")?; } @@ -227,18 +227,12 @@ fn sort_csv( // amortize allocations let mut record_wrk = csv::ByteRecord::new(); let mut line = String::new(); - #[allow(unused_assignments)] - let mut line_parts: Vec<&str> = Vec::with_capacity(2); for l in sorted_line_rdr.lines() { line.clone_from(&l?); - line_parts = line.rsplitn(2, "").collect(); - if line_parts.len() != 2 { - return fail_clierror!("Invalid sorted line format"); - } - let position: u64 = line_parts[0] - .parse() - .map_err(|e| format!("Failed to retrieve position: {e}"))?; + let Ok(position) = atoi_simd::parse::<u64>((&line[line.len() - width..]).as_bytes()) else { + return fail!("Failed to retrieve position: invalid integer"); + }; idxfile.seek(position - position_delta)?; idxfile.read_byte_record(&mut record_wrk)?; From 
8ea48b336cf5d6859cd1212c163c0c16bbff3764 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Mon, 14 Oct 2024 07:07:46 -0400 Subject: [PATCH 047/119] clippy:needless_borrow warning: this expression borrows a value the compiler would automatically borrow --> src/cmd/extsort.rs:233:52 | 233 | let Ok(position) = atoi_simd::parse::<u64>((&line[line.len() - width..]).as_bytes()) else { | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ help: change this to: `line[line.len() - width..]` | = help: for further information visit https://rust-lang.github.io/rust-clippy/master/index.html#needless_borrow = note: `#[warn(clippy::needless_borrow)]` on by default --- Cargo.lock | 4 ++-- src/cmd/extsort.rs | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3dcb78fd3..e8e398eb1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6717,9 +6717,9 @@ dependencies = [ [[package]] name = "sorted-vec" -version = "0.8.3" +version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c6734caf0b6f51addd5eeacca12fb39b2c6c14e8d4f3ac42f3a78955c0467458" +checksum = "2432f7c120a790d329ee1c54c9c09495b163a72eb338b0682742e2c173b15107" [[package]] name = "spin" diff --git a/src/cmd/extsort.rs b/src/cmd/extsort.rs index 6969cdde5..3491b6517 100644 --- a/src/cmd/extsort.rs +++ b/src/cmd/extsort.rs @@ -230,7 +230,7 @@ fn sort_csv( for l in sorted_line_rdr.lines() { line.clone_from(&l?); - let Ok(position) = atoi_simd::parse::<u64>((&line[line.len() - width..]).as_bytes()) else { + let Ok(position) = atoi_simd::parse::<u64>(line[line.len() - width..].as_bytes()) else { return fail!("Failed to retrieve position: invalid integer"); }; From f2918993cee6fafe6edb9fc4bd224c9359317e34 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Mon, 14 Oct 2024 10:49:07 -0400 Subject: [PATCH 048/119] `docs`: qsv can wrangle more than CSVs... 
[skip ci] --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 1b2d1f110..2f42f4a04 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -## qsv: Blazing-fast CSV data-wrangling toolkit +## qsv: Blazing-fast Data-Wrangling toolkit [![Linux build status](https://github.com/jqnatividad/qsv/actions/workflows/rust.yml/badge.svg)](https://github.com/jqnatividad/qsv/actions/workflows/rust.yml) [![Windows build status](https://github.com/jqnatividad/qsv/actions/workflows/rust-windows.yml/badge.svg)](https://github.com/jqnatividad/qsv/actions/workflows/rust-windows.yml) @@ -17,7 +17,7 @@   | Table of Contents :--------------------------|:------------------------- -![qsv logo](docs/images/qsv-logo.png)
[_Hi-ho "Quicksilver" away!_](https://www.youtube.com/watch?v=p9lf76xOA5k)
[logo details](https://github.com/jqnatividad/qsv/discussions/295)
|qsv is a command line program for querying, slicing,
indexing, analyzing, filtering, enriching, transforming,
sorting, validating, joining & converting CSV files.
Commands are simple, composable & ___"blazing fast"___.

* [Commands](#available-commands)
* [Installation Options](#installation-options)
* [Whirlwind Tour](docs/whirlwind_tour.md#a-whirlwind-tour) / [Notebooks](contrib/notebooks/) / [Lessons & Exercises](https://100.dathere.com)
* [Cookbook](https://github.com/jqnatividad/qsv/wiki/Cookbook#cookbook)
* [FAQ](https://github.com/jqnatividad/qsv/discussions/categories/faq)
* [Performance Tuning](docs/PERFORMANCE.md#performance-tuning)
* 👉 [Benchmarks](https://qsv.dathere.com/benchmarks) 🚀
* [Environment Variables](docs/ENVIRONMENT_VARIABLES.md)
* [Feature Flags](#feature-flags)
* [Goals/Non-goals](#goals--non-goals)
* [Testing](#testing)
* [NYC School of Data 2022](https://docs.google.com/presentation/d/e/2PACX-1vQ12ndZL--gkz0HLQRaxqsNOwzddkv1iUKB3sq661yA77OPlAsmHJHpjaqt9s9QEf73VqMfb0cv4jHU/pub?start=false&loop=false&delayms=3000)/[csv,conf,v8](https://docs.google.com/presentation/d/10T_3MyIqS5UsKxJaOY7Ktrd-GfhJelQImlE_qYmtuis/edit#slide=id.g2e0f1e7aa0e_0_62) slides
* [Sponsor](#sponsor) +![qsv logo](docs/images/qsv-logo.png)
[_Hi-ho "Quicksilver" away!_](https://www.youtube.com/watch?v=p9lf76xOA5k)
[logo details](https://github.com/jqnatividad/qsv/discussions/295)
|qsv is a command line program for querying, slicing,
indexing, analyzing, filtering, enriching, transforming,
sorting, validating, joining & converting tabular data (CSV, spreadsheets, [etc.](#file-formats)).
Commands are simple, composable & ___"blazing fast"___.

* [Commands](#available-commands)
* [Installation Options](#installation-options)
* [Whirlwind Tour](docs/whirlwind_tour.md#a-whirlwind-tour) / [Notebooks](contrib/notebooks/) / [Lessons & Exercises](https://100.dathere.com)
* [Cookbook](https://github.com/jqnatividad/qsv/wiki/Cookbook#cookbook)
* [FAQ](https://github.com/jqnatividad/qsv/discussions/categories/faq)
* [Performance Tuning](docs/PERFORMANCE.md#performance-tuning)
* 👉 [Benchmarks](https://qsv.dathere.com/benchmarks) 🚀
* [Environment Variables](docs/ENVIRONMENT_VARIABLES.md)
* [Feature Flags](#feature-flags)
* [Goals/Non-goals](#goals--non-goals)
* [Testing](#testing)
* [NYC School of Data 2022](https://docs.google.com/presentation/d/e/2PACX-1vQ12ndZL--gkz0HLQRaxqsNOwzddkv1iUKB3sq661yA77OPlAsmHJHpjaqt9s9QEf73VqMfb0cv4jHU/pub?start=false&loop=false&delayms=3000)/[csv,conf,v8](https://docs.google.com/presentation/d/10T_3MyIqS5UsKxJaOY7Ktrd-GfhJelQImlE_qYmtuis/edit#slide=id.g2e0f1e7aa0e_0_62) slides
* [Sponsor](#sponsor)
From ea43b649ae4c36f84260c08bdeecf601830a0471 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Mon, 14 Oct 2024 10:51:28 -0400 Subject: [PATCH 049/119] `docs`: add linebreak for better spacing [skip ci] --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 2f42f4a04..02c3ad9d8 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@   | Table of Contents :--------------------------|:------------------------- -![qsv logo](docs/images/qsv-logo.png)
[_Hi-ho "Quicksilver" away!_](https://www.youtube.com/watch?v=p9lf76xOA5k)
[logo details](https://github.com/jqnatividad/qsv/discussions/295)
|qsv is a command line program for querying, slicing,
indexing, analyzing, filtering, enriching, transforming,
sorting, validating, joining & converting tabular data (CSV, spreadsheets, [etc.](#file-formats)).
Commands are simple, composable & ___"blazing fast"___.

* [Commands](#available-commands)
* [Installation Options](#installation-options)
* [Whirlwind Tour](docs/whirlwind_tour.md#a-whirlwind-tour) / [Notebooks](contrib/notebooks/) / [Lessons & Exercises](https://100.dathere.com)
* [Cookbook](https://github.com/jqnatividad/qsv/wiki/Cookbook#cookbook)
* [FAQ](https://github.com/jqnatividad/qsv/discussions/categories/faq)
* [Performance Tuning](docs/PERFORMANCE.md#performance-tuning)
* 👉 [Benchmarks](https://qsv.dathere.com/benchmarks) 🚀
* [Environment Variables](docs/ENVIRONMENT_VARIABLES.md)
* [Feature Flags](#feature-flags)
* [Goals/Non-goals](#goals--non-goals)
* [Testing](#testing)
* [NYC School of Data 2022](https://docs.google.com/presentation/d/e/2PACX-1vQ12ndZL--gkz0HLQRaxqsNOwzddkv1iUKB3sq661yA77OPlAsmHJHpjaqt9s9QEf73VqMfb0cv4jHU/pub?start=false&loop=false&delayms=3000)/[csv,conf,v8](https://docs.google.com/presentation/d/10T_3MyIqS5UsKxJaOY7Ktrd-GfhJelQImlE_qYmtuis/edit#slide=id.g2e0f1e7aa0e_0_62) slides
* [Sponsor](#sponsor) +![qsv logo](docs/images/qsv-logo.png)
[_Hi-ho "Quicksilver" away!_](https://www.youtube.com/watch?v=p9lf76xOA5k)
[logo details](https://github.com/jqnatividad/qsv/discussions/295)
|qsv is a command line program for querying, slicing,
indexing, analyzing, filtering, enriching, transforming,
sorting, validating, joining & converting tabular data>
(CSV, spreadsheets, [etc.](#file-formats)).
Commands are simple, composable & ___"blazing fast"___.

* [Commands](#available-commands)
* [Installation Options](#installation-options)
* [Whirlwind Tour](docs/whirlwind_tour.md#a-whirlwind-tour) / [Notebooks](contrib/notebooks/) / [Lessons & Exercises](https://100.dathere.com)
* [Cookbook](https://github.com/jqnatividad/qsv/wiki/Cookbook#cookbook)
* [FAQ](https://github.com/jqnatividad/qsv/discussions/categories/faq)
* [Performance Tuning](docs/PERFORMANCE.md#performance-tuning)
* 👉 [Benchmarks](https://qsv.dathere.com/benchmarks) 🚀
* [Environment Variables](docs/ENVIRONMENT_VARIABLES.md)
* [Feature Flags](#feature-flags)
* [Goals/Non-goals](#goals--non-goals)
* [Testing](#testing)
* [NYC School of Data 2022](https://docs.google.com/presentation/d/e/2PACX-1vQ12ndZL--gkz0HLQRaxqsNOwzddkv1iUKB3sq661yA77OPlAsmHJHpjaqt9s9QEf73VqMfb0cv4jHU/pub?start=false&loop=false&delayms=3000)/[csv,conf,v8](https://docs.google.com/presentation/d/10T_3MyIqS5UsKxJaOY7Ktrd-GfhJelQImlE_qYmtuis/edit#slide=id.g2e0f1e7aa0e_0_62) slides
* [Sponsor](#sponsor)
From 54a9cf9295c67cd298d8dc286c874a2b86bc8ee2 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Mon, 14 Oct 2024 10:52:18 -0400 Subject: [PATCH 050/119] typo [skip ci] --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 02c3ad9d8..dba160330 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@   | Table of Contents :--------------------------|:------------------------- -![qsv logo](docs/images/qsv-logo.png)
[_Hi-ho "Quicksilver" away!_](https://www.youtube.com/watch?v=p9lf76xOA5k)
[logo details](https://github.com/jqnatividad/qsv/discussions/295)
|qsv is a command line program for querying, slicing,
indexing, analyzing, filtering, enriching, transforming,
sorting, validating, joining & converting tabular data>
(CSV, spreadsheets, [etc.](#file-formats)).
Commands are simple, composable & ___"blazing fast"___.

* [Commands](#available-commands)
* [Installation Options](#installation-options)
* [Whirlwind Tour](docs/whirlwind_tour.md#a-whirlwind-tour) / [Notebooks](contrib/notebooks/) / [Lessons & Exercises](https://100.dathere.com)
* [Cookbook](https://github.com/jqnatividad/qsv/wiki/Cookbook#cookbook)
* [FAQ](https://github.com/jqnatividad/qsv/discussions/categories/faq)
* [Performance Tuning](docs/PERFORMANCE.md#performance-tuning)
* 👉 [Benchmarks](https://qsv.dathere.com/benchmarks) 🚀
* [Environment Variables](docs/ENVIRONMENT_VARIABLES.md)
* [Feature Flags](#feature-flags)
* [Goals/Non-goals](#goals--non-goals)
* [Testing](#testing)
* [NYC School of Data 2022](https://docs.google.com/presentation/d/e/2PACX-1vQ12ndZL--gkz0HLQRaxqsNOwzddkv1iUKB3sq661yA77OPlAsmHJHpjaqt9s9QEf73VqMfb0cv4jHU/pub?start=false&loop=false&delayms=3000)/[csv,conf,v8](https://docs.google.com/presentation/d/10T_3MyIqS5UsKxJaOY7Ktrd-GfhJelQImlE_qYmtuis/edit#slide=id.g2e0f1e7aa0e_0_62) slides
* [Sponsor](#sponsor) +![qsv logo](docs/images/qsv-logo.png)
[_Hi-ho "Quicksilver" away!_](https://www.youtube.com/watch?v=p9lf76xOA5k)
[logo details](https://github.com/jqnatividad/qsv/discussions/295)
|qsv is a command line program for querying, slicing,
indexing, analyzing, filtering, enriching, transforming,
sorting, validating, joining & converting tabular data
(CSV, spreadsheets, [etc.](#file-formats)).
Commands are simple, composable & ___"blazing fast"___.

* [Commands](#available-commands)
* [Installation Options](#installation-options)
* [Whirlwind Tour](docs/whirlwind_tour.md#a-whirlwind-tour) / [Notebooks](contrib/notebooks/) / [Lessons & Exercises](https://100.dathere.com)
* [Cookbook](https://github.com/jqnatividad/qsv/wiki/Cookbook#cookbook)
* [FAQ](https://github.com/jqnatividad/qsv/discussions/categories/faq)
* [Performance Tuning](docs/PERFORMANCE.md#performance-tuning)
* 👉 [Benchmarks](https://qsv.dathere.com/benchmarks) 🚀
* [Environment Variables](docs/ENVIRONMENT_VARIABLES.md)
* [Feature Flags](#feature-flags)
* [Goals/Non-goals](#goals--non-goals)
* [Testing](#testing)
* [NYC School of Data 2022](https://docs.google.com/presentation/d/e/2PACX-1vQ12ndZL--gkz0HLQRaxqsNOwzddkv1iUKB3sq661yA77OPlAsmHJHpjaqt9s9QEf73VqMfb0cv4jHU/pub?start=false&loop=false&delayms=3000)/[csv,conf,v8](https://docs.google.com/presentation/d/10T_3MyIqS5UsKxJaOY7Ktrd-GfhJelQImlE_qYmtuis/edit#slide=id.g2e0f1e7aa0e_0_62) slides
* [Sponsor](#sponsor)
From 3459af71c1d1e10baa5a57dc88f5c01e049e147c Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Mon, 14 Oct 2024 15:29:56 -0400 Subject: [PATCH 051/119] typo branding - datHere not dathere [skip ci] --- src/config.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/config.rs b/src/config.rs index e68d0917f..e83b35a00 100644 --- a/src/config.rs +++ b/src/config.rs @@ -30,7 +30,7 @@ const NO_INDEX_WARNING_FILESIZE: u64 = 100_000_000; // 100MB static AUTO_INDEXED: AtomicBool = AtomicBool::new(false); pub static SPONSOR_MESSAGE: &str = r#"sponsored by datHere - Data Infrastructure Engineering (https://qsv.datHere.com) -Need a UI & more advanced data-wrangling? Upgrade to qsv pro (https://qsvpro.dathere.com) +Need a UI & more advanced data-wrangling? Upgrade to qsv pro (https://qsvpro.datHere.com) "#; #[derive(Clone, Copy, Debug, PartialEq, Eq)] From bf4f0dfb988714fea1f3dc9d0ab9b9ec0fbbf326 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Mon, 14 Oct 2024 15:33:11 -0400 Subject: [PATCH 052/119] `deps`: bump to latest polars upstream --- Cargo.lock | 43 +++++++++++++++++++++---------------------- Cargo.toml | 4 ++-- 2 files changed, 23 insertions(+), 24 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e8e398eb1..89753bc22 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4750,7 +4750,7 @@ dependencies = [ [[package]] name = "polars" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=ff10b38#ff10b3814ae91be40cf046d74d5f9088f7b33fdf" +source = "git+https://github.com/pola-rs/polars?rev=900dc3b#900dc3b60c0ba050d9b19c936b772e101fda830e" dependencies = [ "getrandom", "polars-arrow", @@ -4769,7 +4769,7 @@ dependencies = [ [[package]] name = "polars-arrow" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=ff10b38#ff10b3814ae91be40cf046d74d5f9088f7b33fdf" +source = 
"git+https://github.com/pola-rs/polars?rev=900dc3b#900dc3b60c0ba050d9b19c936b772e101fda830e" dependencies = [ "ahash", "atoi", @@ -4782,7 +4782,6 @@ dependencies = [ "either", "ethnum", "fast-float", - "futures", "getrandom", "hashbrown 0.15.0", "itoa", @@ -4817,7 +4816,7 @@ dependencies = [ [[package]] name = "polars-compute" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=ff10b38#ff10b3814ae91be40cf046d74d5f9088f7b33fdf" +source = "git+https://github.com/pola-rs/polars?rev=900dc3b#900dc3b60c0ba050d9b19c936b772e101fda830e" dependencies = [ "bytemuck", "either", @@ -4832,7 +4831,7 @@ dependencies = [ [[package]] name = "polars-core" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=ff10b38#ff10b3814ae91be40cf046d74d5f9088f7b33fdf" +source = "git+https://github.com/pola-rs/polars?rev=900dc3b#900dc3b60c0ba050d9b19c936b772e101fda830e" dependencies = [ "ahash", "bitflags 2.6.0", @@ -4866,7 +4865,7 @@ dependencies = [ [[package]] name = "polars-error" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=ff10b38#ff10b3814ae91be40cf046d74d5f9088f7b33fdf" +source = "git+https://github.com/pola-rs/polars?rev=900dc3b#900dc3b60c0ba050d9b19c936b772e101fda830e" dependencies = [ "avro-schema", "object_store", @@ -4879,7 +4878,7 @@ dependencies = [ [[package]] name = "polars-expr" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=ff10b38#ff10b3814ae91be40cf046d74d5f9088f7b33fdf" +source = "git+https://github.com/pola-rs/polars?rev=900dc3b#900dc3b60c0ba050d9b19c936b772e101fda830e" dependencies = [ "ahash", "bitflags 2.6.0", @@ -4899,7 +4898,7 @@ dependencies = [ [[package]] name = "polars-io" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=ff10b38#ff10b3814ae91be40cf046d74d5f9088f7b33fdf" +source = "git+https://github.com/pola-rs/polars?rev=900dc3b#900dc3b60c0ba050d9b19c936b772e101fda830e" dependencies = [ "ahash", "async-trait", @@ -4945,7 +4944,7 @@ dependencies = [ 
[[package]] name = "polars-json" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=ff10b38#ff10b3814ae91be40cf046d74d5f9088f7b33fdf" +source = "git+https://github.com/pola-rs/polars?rev=900dc3b#900dc3b60c0ba050d9b19c936b772e101fda830e" dependencies = [ "ahash", "chrono", @@ -4966,7 +4965,7 @@ dependencies = [ [[package]] name = "polars-lazy" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=ff10b38#ff10b3814ae91be40cf046d74d5f9088f7b33fdf" +source = "git+https://github.com/pola-rs/polars?rev=900dc3b#900dc3b60c0ba050d9b19c936b772e101fda830e" dependencies = [ "ahash", "bitflags 2.6.0", @@ -4992,7 +4991,7 @@ dependencies = [ [[package]] name = "polars-mem-engine" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=ff10b38#ff10b3814ae91be40cf046d74d5f9088f7b33fdf" +source = "git+https://github.com/pola-rs/polars?rev=900dc3b#900dc3b60c0ba050d9b19c936b772e101fda830e" dependencies = [ "futures", "memmap2", @@ -5013,7 +5012,7 @@ dependencies = [ [[package]] name = "polars-ops" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=ff10b38#ff10b3814ae91be40cf046d74d5f9088f7b33fdf" +source = "git+https://github.com/pola-rs/polars?rev=900dc3b#900dc3b60c0ba050d9b19c936b772e101fda830e" dependencies = [ "ahash", "argminmax", @@ -5046,7 +5045,7 @@ dependencies = [ [[package]] name = "polars-parquet" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=ff10b38#ff10b3814ae91be40cf046d74d5f9088f7b33fdf" +source = "git+https://github.com/pola-rs/polars?rev=900dc3b#900dc3b60c0ba050d9b19c936b772e101fda830e" dependencies = [ "ahash", "async-stream", @@ -5074,7 +5073,7 @@ dependencies = [ [[package]] name = "polars-pipe" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=ff10b38#ff10b3814ae91be40cf046d74d5f9088f7b33fdf" +source = "git+https://github.com/pola-rs/polars?rev=900dc3b#900dc3b60c0ba050d9b19c936b772e101fda830e" dependencies = [ "crossbeam-channel", 
"crossbeam-queue", @@ -5100,7 +5099,7 @@ dependencies = [ [[package]] name = "polars-plan" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=ff10b38#ff10b3814ae91be40cf046d74d5f9088f7b33fdf" +source = "git+https://github.com/pola-rs/polars?rev=900dc3b#900dc3b60c0ba050d9b19c936b772e101fda830e" dependencies = [ "ahash", "bitflags 2.6.0", @@ -5133,7 +5132,7 @@ dependencies = [ [[package]] name = "polars-row" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=ff10b38#ff10b3814ae91be40cf046d74d5f9088f7b33fdf" +source = "git+https://github.com/pola-rs/polars?rev=900dc3b#900dc3b60c0ba050d9b19c936b772e101fda830e" dependencies = [ "bytemuck", "polars-arrow", @@ -5144,7 +5143,7 @@ dependencies = [ [[package]] name = "polars-schema" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=ff10b38#ff10b3814ae91be40cf046d74d5f9088f7b33fdf" +source = "git+https://github.com/pola-rs/polars?rev=900dc3b#900dc3b60c0ba050d9b19c936b772e101fda830e" dependencies = [ "indexmap", "polars-error", @@ -5156,7 +5155,7 @@ dependencies = [ [[package]] name = "polars-sql" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=ff10b38#ff10b3814ae91be40cf046d74d5f9088f7b33fdf" +source = "git+https://github.com/pola-rs/polars?rev=900dc3b#900dc3b60c0ba050d9b19c936b772e101fda830e" dependencies = [ "hex", "once_cell", @@ -5177,7 +5176,7 @@ dependencies = [ [[package]] name = "polars-time" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=ff10b38#ff10b3814ae91be40cf046d74d5f9088f7b33fdf" +source = "git+https://github.com/pola-rs/polars?rev=900dc3b#900dc3b60c0ba050d9b19c936b772e101fda830e" dependencies = [ "atoi", "bytemuck", @@ -5197,7 +5196,7 @@ dependencies = [ [[package]] name = "polars-utils" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=ff10b38#ff10b3814ae91be40cf046d74d5f9088f7b33fdf" +source = 
"git+https://github.com/pola-rs/polars?rev=900dc3b#900dc3b60c0ba050d9b19c936b772e101fda830e" dependencies = [ "ahash", "bytemuck", @@ -6283,9 +6282,9 @@ dependencies = [ [[package]] name = "scc" -version = "2.2.1" +version = "2.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "553f8299af7450cda9a52d3a370199904e7a46b5ffd1bef187c4a6af3bb6db69" +checksum = "f2c1f7fc6deb21665a9060dfc7d271be784669295a31babdcd4dd2c79ae8cbfb" dependencies = [ "sdd", ] diff --git a/Cargo.toml b/Cargo.toml index 884247a52..fc8aa6d5d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -314,9 +314,9 @@ governor = { git = "https://github.com/jqnatividad/governor", branch = "deps-bum # BUILD NOTE: Be sure to set QSV_POLARS_REV below to the latest commit short hash or tag # of polars/py-polars before building qsv. This allows us to show the polars rev/tag in --version. # if we are using a release version of Rust Polars, leave QSV_POLARS_REV empty -# QSV_POLARS_REV=ff10b38 +# QSV_POLARS_REV=900dc3b # polars = { git = "https://github.com/pola-rs/polars", tag = "py-1.9.0" } -polars = { git = "https://github.com/pola-rs/polars", rev = "ff10b38" } +polars = { git = "https://github.com/pola-rs/polars", rev = "900dc3b" } [features] default = ["mimalloc"] From 2179bb508673c494a31449f8382684e04d814ed8 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Mon, 14 Oct 2024 21:40:47 -0400 Subject: [PATCH 053/119] `deps`: use updated version of csvlens fork using the latest release of arrow with the lexical-core soundness issue addressed, so we no longer need to have a direct arrow dependency https://github.com/advisories/GHSA-2326-pfpj-vx3h --- Cargo.lock | 7 +++---- Cargo.toml | 7 ++----- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 89753bc22..0c876a03f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1791,7 +1791,7 @@ dependencies = [ [[package]] name = "csvlens" version = "0.10.1" -source = 
"git+https://github.com/jqnatividad/csvlens?branch=dependency-upgrades-lexical-core_fix#89e8bbf397017ce067bcb552f66658e3db4d939b" +source = "git+https://github.com/jqnatividad/csvlens?branch=bump-dependencies-especially-arrow#3fcedb92c3d800f0ecfc2427bbe8eee3ce33fcdd" dependencies = [ "anyhow", "arboard", @@ -5441,7 +5441,6 @@ dependencies = [ "actix-web", "ahash", "arboard", - "arrow", "assert-json-diff", "atoi_simd 0.16.0", "base62", @@ -6251,9 +6250,9 @@ dependencies = [ [[package]] name = "rustversion" -version = "1.0.17" +version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "955d28af4278de8121b7ebeb796b6a45735dc01436d898801014aced2773a3d6" +checksum = "0e819f2bc632f285be6d7cd36e25940d45b2391dd6d9b939e79de557f7014248" [[package]] name = "ryu" diff --git a/Cargo.toml b/Cargo.toml index fc8aa6d5d..9cfe02587 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -70,9 +70,6 @@ panic = "abort" [dependencies] ahash = "0.8" arboard = { version = "3.4.1", default-features = false } -arrow = { version = "53", default-features = false, features = [ - "csv", -], optional = true } atoi_simd = "0.16" base62 = { version = "2.0", optional = true } bytemuck = { version = "1.19", features = [ @@ -281,7 +278,7 @@ csv-core = { git = "https://github.com/jqnatividad/rust-csv", branch = "qsv-opt csv-index = { git = "https://github.com/jqnatividad/rust-csv", branch = "qsv-optimized" } # use our csvlens fork with latest dependencies, including arrow 53 upstream, with unreleased lexical-core fix -csvlens = { git = "https://github.com/jqnatividad/csvlens", branch = "dependency-upgrades-lexical-core_fix" } +csvlens = { git = "https://github.com/jqnatividad/csvlens", branch = "bump-dependencies-especially-arrow" } # modernized fork of crc32fast, 2021 edition, MSRV 1.81, select clippy lint suggestions applied # crc32fast = { git = "https://github.com/jqnatividad/rust-crc32fast", branch = "modernize" } @@ -375,7 +372,7 @@ polars = ["dep:polars", "bytemuck"] 
prompt = ["rfd"] python = ["pyo3"] to = ["csvs_convert", "xlsxwriter"] -lens = ["arrow", "csvlens"] +lens = ["csvlens"] lite = [] datapusher_plus = ["self_update"] feature_capable = [] From 94f3e04903a6af8b047960f98acbbf1bec511d7c Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Tue, 15 Oct 2024 04:48:56 -0400 Subject: [PATCH 054/119] `deps`: use latest version of our csvlens fork with bumped dependencies also bumped rustls-pki-types from 1.9.0 to 1.10.0 --- Cargo.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0c876a03f..e2ebd25ff 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1791,7 +1791,7 @@ dependencies = [ [[package]] name = "csvlens" version = "0.10.1" -source = "git+https://github.com/jqnatividad/csvlens?branch=bump-dependencies-especially-arrow#3fcedb92c3d800f0ecfc2427bbe8eee3ce33fcdd" +source = "git+https://github.com/jqnatividad/csvlens?branch=bump-dependencies-especially-arrow#5c5c438f73669bb287c88d0c9dbff5c58bba9cfa" dependencies = [ "anyhow", "arboard", @@ -6233,9 +6233,9 @@ dependencies = [ [[package]] name = "rustls-pki-types" -version = "1.9.0" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0e696e35370c65c9c541198af4543ccd580cf17fc25d8e05c5a242b202488c55" +checksum = "16f1201b3c9a7ee8039bcadc17b7e605e2945b27eee7631788c1bd2b0643674b" [[package]] name = "rustls-webpki" From be1bc90e1234ef81720c73ea83ad9cda0b236f15 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Tue, 15 Oct 2024 06:26:33 -0400 Subject: [PATCH 055/119] `deps`: use publicsuffix upstream with unreleased dependency bumps; alpha-sort patch section gets rid of old idna 0.3.0 dependency --- Cargo.lock | 16 +++------------- Cargo.toml | 13 +++++++++---- 2 files changed, 12 insertions(+), 17 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e2ebd25ff..e5f5ac525 100644 --- 
a/Cargo.lock +++ b/Cargo.lock @@ -3216,16 +3216,6 @@ version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" -[[package]] -name = "idna" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e14ddfc70884202db2244c223200c204c2bda1bc6e0998d11b5e024d657209e6" -dependencies = [ - "unicode-bidi", - "unicode-normalization", -] - [[package]] name = "idna" version = "0.5.0" @@ -5363,10 +5353,9 @@ dependencies = [ [[package]] name = "publicsuffix" version = "2.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "96a8c1bda5ae1af7f99a2962e49df150414a43d62404644d98dd5c3a93d07457" +source = "git+https://github.com/rushmorem/publicsuffix?rev=b300356#b30035623f75d8122c222058eff48835078276f4" dependencies = [ - "idna 0.3.0", + "idna 0.5.0", "psl-types", ] @@ -5501,6 +5490,7 @@ dependencies = [ "parking_lot 0.12.3", "phf 0.11.2", "polars", + "publicsuffix", "pyo3", "qsv-dateparser", "qsv-sniffer", diff --git a/Cargo.toml b/Cargo.toml index 9cfe02587..8bd0f2363 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -172,6 +172,7 @@ polars = { version = "0.43", features = [ "streaming", "timezones", ], optional = true } +publicsuffix = { version = "2.2", optional = true } pyo3 = { version = "0.22", features = [ "auto-initialize", "gil-refs", @@ -287,20 +288,23 @@ csvlens = { git = "https://github.com/jqnatividad/csvlens", branch = "bump-depen # see https://github.com/jan-auer/dynfmt/pull/9 dynfmt = { git = "https://github.com/jqnatividad/dynfmt", branch = "2021-clippy_ptr_as_ptr-bumpdeps" } +# use our fork of governor with bumped dashmap dependency +governor = { git = "https://github.com/jqnatividad/governor", branch = "deps-bump-dashmap" } + # needed to get latest dependencies and unreleased fixes grex = { git = "https://github.com/pemistahl/grex", rev = "ff8533d" } # use modernized version of local_encoding 
local-encoding = { git = "https://github.com/slonopotamus/local-encoding-rs", branch = "travis-madness" } -# use of fork of xlsxwriter with bumped bindgen dependency -xlsxwriter = { git = "https://github.com/jqnatividad/xlsxwriter-rs", branch = "bump-bindgen-to-0.70.1" } +# use upstream publicsuffix with unreleased dependency bumps +publicsuffix = { git = "https://github.com/rushmorem/publicsuffix", rev = "b300356" } # use serde_json with unreleased optimizations serde_json = { git = "https://github.com/serde-rs/json", rev = "faab2e8" } -# use our fork of governor with bumped dashmap dependency -governor = { git = "https://github.com/jqnatividad/governor", branch = "deps-bump-dashmap" } +# use of fork of xlsxwriter with bumped bindgen dependency +xlsxwriter = { git = "https://github.com/jqnatividad/xlsxwriter-rs", branch = "bump-bindgen-to-0.70.1" } # Polars has a much higher release tempo for its Python bindings compared # to its underlying Rust library. See https://github.com/pola-rs/polars/releases @@ -353,6 +357,7 @@ fetch = [ "governor", "hashbrown", "jql-runner", + "publicsuffix", "redis", "serde_stacker", "serde_urlencoded", From 356d2c02195928d6f7b7c6364cfdcd94b347e710 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Tue, 15 Oct 2024 13:15:35 -0400 Subject: [PATCH 056/119] `deps`: use latest csvlens upstream with dependency fixes; also bumped hyper from 1.4.1 to 1.5.0 --- Cargo.lock | 6 +++--- Cargo.toml | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e5f5ac525..7cb8e8d97 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1791,7 +1791,7 @@ dependencies = [ [[package]] name = "csvlens" version = "0.10.1" -source = "git+https://github.com/jqnatividad/csvlens?branch=bump-dependencies-especially-arrow#5c5c438f73669bb287c88d0c9dbff5c58bba9cfa" +source = "git+https://github.com/YS-L/csvlens?rev=002edeb#002edebeda69b0b81b4fd4060e25e80c5a87f4da" dependencies = [ 
"anyhow", "arboard", @@ -3013,9 +3013,9 @@ checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" [[package]] name = "hyper" -version = "1.4.1" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50dfd22e0e76d0f662d429a5f80fcaf3855009297eab6a0a9f8543834744ba05" +checksum = "bbbff0a806a4728c99295b254c8838933b5b082d75e3cb70c8dab21fdfbcfa9a" dependencies = [ "bytes", "futures-channel", diff --git a/Cargo.toml b/Cargo.toml index 8bd0f2363..36c1d8934 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -278,8 +278,8 @@ csv = { git = "https://github.com/jqnatividad/rust-csv", branch = "qsv-opt csv-core = { git = "https://github.com/jqnatividad/rust-csv", branch = "qsv-optimized" } csv-index = { git = "https://github.com/jqnatividad/rust-csv", branch = "qsv-optimized" } -# use our csvlens fork with latest dependencies, including arrow 53 upstream, with unreleased lexical-core fix -csvlens = { git = "https://github.com/jqnatividad/csvlens", branch = "bump-dependencies-especially-arrow" } +# use latest csvlens upstream with latest dependencies, including arrow 53, with lexical-core fix +csvlens = { git = "https://github.com/YS-L/csvlens", rev = "002edeb" } # modernized fork of crc32fast, 2021 edition, MSRV 1.81, select clippy lint suggestions applied # crc32fast = { git = "https://github.com/jqnatividad/rust-crc32fast", branch = "modernize" } From 68bfad32e601bdbb560282a14b8b94f4cb5789ec Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Tue, 15 Oct 2024 16:06:35 -0400 Subject: [PATCH 057/119] `deps`: use our fork of strum/strum_macros --- Cargo.lock | 92 ++++++++++++------------------------------------------ Cargo.toml | 5 +++ 2 files changed, 25 insertions(+), 72 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 7cb8e8d97..24c6667d7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1236,7 +1236,7 @@ checksum = 
"d59ae0466b83e838b81a54256c39d5d7c20b9d7daa10510a242d9b75abd5936e" dependencies = [ "chrono", "chrono-tz-build 0.2.1", - "phf 0.11.2", + "phf", ] [[package]] @@ -1247,7 +1247,7 @@ checksum = "93698b29de5e97ad0ae26447b344c482a7284c737d9ddc5f9e52b74a336671bb" dependencies = [ "chrono", "chrono-tz-build 0.3.0", - "phf 0.11.2", + "phf", ] [[package]] @@ -1258,7 +1258,7 @@ checksum = "cd6dd8046d00723a59a2f8c5f295c515b9bb9a331ee4f8f3d4dd49e428acd3b6" dependencies = [ "chrono", "chrono-tz-build 0.4.0", - "phf 0.11.2", + "phf", ] [[package]] @@ -1268,7 +1268,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "433e39f13c9a060046954e0592a8d0a4bcb1040125cbf91cb8ee58964cfb350f" dependencies = [ "parse-zoneinfo", - "phf 0.11.2", + "phf", "phf_codegen", ] @@ -1279,7 +1279,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0c088aee841df9c3041febbb73934cfc39708749bf96dc827e3359cd39ef11b1" dependencies = [ "parse-zoneinfo", - "phf 0.11.2", + "phf", "phf_codegen", ] @@ -3253,9 +3253,9 @@ dependencies = [ [[package]] name = "impl-more" -version = "0.1.6" +version = "0.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "206ca75c9c03ba3d4ace2460e57b189f39f43de612c2f85836e65c929701bb2d" +checksum = "e658178c10c747241199382079c0f195ce229866fbf4aa0d46fa6107fe33d2ec" [[package]] name = "indexmap" @@ -4560,25 +4560,14 @@ dependencies = [ "indexmap", ] -[[package]] -name = "phf" -version = "0.10.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fabbf1ead8a5bcbc20f5f8b939ee3f5b0f6f281b6ad3468b84656b658b455259" -dependencies = [ - "phf_macros 0.10.0", - "phf_shared 0.10.0", - "proc-macro-hack", -] - [[package]] name = "phf" version = "0.11.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ade2d8b8f33c7333b51bcf0428d37e217e9f32192ae4772156f65063b8ce03dc" dependencies = [ - "phf_macros 0.11.2", - "phf_shared 0.11.2", + "phf_macros", + 
"phf_shared", ] [[package]] @@ -4587,18 +4576,8 @@ version = "0.11.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e8d39688d359e6b34654d328e262234662d16cc0f60ec8dcbe5e718709342a5a" dependencies = [ - "phf_generator 0.11.2", - "phf_shared 0.11.2", -] - -[[package]] -name = "phf_generator" -version = "0.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d5285893bb5eb82e6aaf5d59ee909a06a16737a8970984dd7746ba9283498d6" -dependencies = [ - "phf_shared 0.10.0", - "rand", + "phf_generator", + "phf_shared", ] [[package]] @@ -4607,46 +4586,23 @@ version = "0.11.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "48e4cc64c2ad9ebe670cb8fd69dd50ae301650392e81c05f9bfcb2d5bdbc24b0" dependencies = [ - "phf_shared 0.11.2", + "phf_shared", "rand", ] -[[package]] -name = "phf_macros" -version = "0.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "58fdf3184dd560f160dd73922bea2d5cd6e8f064bf4b13110abd81b03697b4e0" -dependencies = [ - "phf_generator 0.10.0", - "phf_shared 0.10.0", - "proc-macro-hack", - "proc-macro2", - "quote", - "syn 1.0.109", -] - [[package]] name = "phf_macros" version = "0.11.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3444646e286606587e49f3bcf1679b8cef1dc2c5ecc29ddacaffc305180d464b" dependencies = [ - "phf_generator 0.11.2", - "phf_shared 0.11.2", + "phf_generator", + "phf_shared", "proc-macro2", "quote", "syn 2.0.79", ] -[[package]] -name = "phf_shared" -version = "0.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6796ad771acdc0123d2a88dc428b5e38ef24456743ddb1744ed628f9815c096" -dependencies = [ - "siphasher", -] - [[package]] name = "phf_shared" version = "0.11.2" @@ -5320,12 +5276,6 @@ dependencies = [ "toml_edit", ] -[[package]] -name = "proc-macro-hack" -version = "0.5.20+deprecated" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum 
= "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068" - [[package]] name = "proc-macro2" version = "1.0.87" @@ -5488,7 +5438,7 @@ dependencies = [ "num_cpus", "odht", "parking_lot 0.12.3", - "phf 0.11.2", + "phf", "polars", "publicsuffix", "pyo3", @@ -6820,19 +6770,17 @@ checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" [[package]] name = "strum" -version = "0.26.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8fec0f0aef304996cf250b31b5a10dee7980c85da9d759361292b8bca5a18f06" +version = "0.26.2" +source = "git+https://github.com/jqnatividad/strum?branch=bump-phf-to-0.11#d60ce1afe338f4456a5c49ef7e51ee083330babe" dependencies = [ - "phf 0.10.1", + "phf", "strum_macros", ] [[package]] name = "strum_macros" version = "0.26.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be" +source = "git+https://github.com/jqnatividad/strum?branch=bump-phf-to-0.11#d60ce1afe338f4456a5c49ef7e51ee083330babe" dependencies = [ "heck 0.5.0", "proc-macro2", @@ -7124,7 +7072,7 @@ dependencies = [ "log", "parking_lot 0.12.3", "percent-encoding", - "phf 0.11.2", + "phf", "pin-project-lite", "postgres-protocol", "postgres-types", diff --git a/Cargo.toml b/Cargo.toml index 36c1d8934..33d358f21 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -303,6 +303,11 @@ publicsuffix = { git = "https://github.com/rushmorem/publicsuffix", rev = "b3003 # use serde_json with unreleased optimizations serde_json = { git = "https://github.com/serde-rs/json", rev = "faab2e8" } +# use our fork of strum with bumped phf dependency. 
They're holding back as they have a conservative +# MSRV policy, while qsv has an aggressive MSRV policy to always require Rust stable +strum = { git = "https://github.com/jqnatividad/strum", branch = "bump-phf-to-0.11" } +strum_macros = { git = "https://github.com/jqnatividad/strum", branch = "bump-phf-to-0.11" } + # use of fork of xlsxwriter with bumped bindgen dependency xlsxwriter = { git = "https://github.com/jqnatividad/xlsxwriter-rs", branch = "bump-bindgen-to-0.70.1" } From 48d8c9211a0f7d28345ef516ecb34505e090b478 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 15 Oct 2024 22:36:57 +0000 Subject: [PATCH 058/119] build(deps): bump pyo3 from 0.22.4 to 0.22.5 Bumps [pyo3](https://github.com/pyo3/pyo3) from 0.22.4 to 0.22.5. - [Release notes](https://github.com/pyo3/pyo3/releases) - [Changelog](https://github.com/PyO3/pyo3/blob/main/CHANGELOG.md) - [Commits](https://github.com/pyo3/pyo3/compare/v0.22.4...v0.22.5) --- updated-dependencies: - dependency-name: pyo3 dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] --- Cargo.lock | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 24c6667d7..6b2329ffc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5311,9 +5311,9 @@ dependencies = [ [[package]] name = "pyo3" -version = "0.22.4" +version = "0.22.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "00e89ce2565d6044ca31a3eb79a334c3a79a841120a98f64eea9f579564cb691" +checksum = "3d922163ba1f79c04bc49073ba7b32fd5a8d3b76a87c955921234b8e77333c51" dependencies = [ "cfg-if", "indoc", @@ -5329,9 +5329,9 @@ dependencies = [ [[package]] name = "pyo3-build-config" -version = "0.22.4" +version = "0.22.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d8afbaf3abd7325e08f35ffb8deb5892046fcb2608b703db6a583a5ba4cea01e" +checksum = "bc38c5feeb496c8321091edf3d63e9a6829eab4b863b4a6a65f26f3e9cc6b179" dependencies = [ "once_cell", "target-lexicon", @@ -5339,9 +5339,9 @@ dependencies = [ [[package]] name = "pyo3-ffi" -version = "0.22.4" +version = "0.22.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec15a5ba277339d04763f4c23d85987a5b08cbb494860be141e6a10a8eb88022" +checksum = "94845622d88ae274d2729fcefc850e63d7a3ddff5e3ce11bd88486db9f1d357d" dependencies = [ "libc", "pyo3-build-config", @@ -5349,9 +5349,9 @@ dependencies = [ [[package]] name = "pyo3-macros" -version = "0.22.4" +version = "0.22.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "15e0f01b5364bcfbb686a52fc4181d412b708a68ed20c330db9fc8d2c2bf5a43" +checksum = "e655aad15e09b94ffdb3ce3d217acf652e26bbc37697ef012f5e5e348c716e5e" dependencies = [ "proc-macro2", "pyo3-macros-backend", @@ -5361,9 +5361,9 @@ dependencies = [ [[package]] name = "pyo3-macros-backend" -version = "0.22.4" +version = "0.22.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"a09b550200e1e5ed9176976d0060cbc2ea82dc8515da07885e7b8153a85caacb" +checksum = "ae1e3f09eecd94618f60a455a23def79f79eba4dc561a97324bf9ac8c6df30ce" dependencies = [ "heck 0.5.0", "proc-macro2", From 788450e027dc379e76e6ed328fa200c28a971bf6 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 16 Oct 2024 06:04:36 +0000 Subject: [PATCH 059/119] build(deps): bump uuid from 1.10.0 to 1.11.0 Bumps [uuid](https://github.com/uuid-rs/uuid) from 1.10.0 to 1.11.0. - [Release notes](https://github.com/uuid-rs/uuid/releases) - [Commits](https://github.com/uuid-rs/uuid/compare/1.10.0...1.11.0) --- updated-dependencies: - dependency-name: uuid dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- Cargo.lock | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 6b2329ffc..c3a28b544 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -7427,9 +7427,9 @@ checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" [[package]] name = "uuid" -version = "1.10.0" +version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81dfa00651efa65069b0b6b651f4aaa31ba9e3c3ce0137aaad053604ee7e0314" +checksum = "f8c5f0a0af699448548ad1a2fbf920fb4bee257eae39953ba95cb84891a0446a" dependencies = [ "getrandom", ] From 7d6ce5ec9675755abd5942a5e9e731592961700d Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Wed, 16 Oct 2024 21:36:37 -0400 Subject: [PATCH 060/119] `apply` & `applydp`: ensure the parallel iterators are not too small to minimize parallelization overhead --- src/cmd/apply.rs | 1 + src/cmd/applydp.rs | 1 + 2 files changed, 2 insertions(+) diff --git a/src/cmd/apply.rs b/src/cmd/apply.rs index 91c3fb1bb..fbd70b20a 100644 --- a/src/cmd/apply.rs +++ b/src/cmd/apply.rs @@ -598,6 +598,7 @@ pub fn run(argv: &[&str]) -> CliResult<()> { // do 
actual apply command via Rayon parallel iterator batch .par_iter() + .with_min_len(1024) .map(|record_item| { let mut record = record_item.clone(); match apply_cmd { diff --git a/src/cmd/applydp.rs b/src/cmd/applydp.rs index 164520d65..f4e9215e7 100644 --- a/src/cmd/applydp.rs +++ b/src/cmd/applydp.rs @@ -417,6 +417,7 @@ pub fn run(argv: &[&str]) -> CliResult<()> { // do actual applydp command via Rayon parallel iterator batch .par_iter() + .with_min_len(1024) .map(|record_item| { let mut record = record_item.clone(); match applydp_cmd { From 651e9989ed2f1da85b088ed14eabb17ce716a673 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Wed, 16 Oct 2024 21:44:48 -0400 Subject: [PATCH 061/119] `deps`: update lock file --- Cargo.lock | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c3a28b544..e81c374cc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -612,9 +612,9 @@ dependencies = [ [[package]] name = "async-compression" -version = "0.4.15" +version = "0.4.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e26a9844c659a2a293d239c7910b752f8487fe122c6c8bd1659bf85a6507c302" +checksum = "103db485efc3e41214fe4fda9f3dbeae2eb9082f48fd236e6095627a9422066e" dependencies = [ "brotli 7.0.0", "flate2", @@ -3240,9 +3240,9 @@ dependencies = [ [[package]] name = "image" -version = "0.25.2" +version = "0.25.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "99314c8a2152b8ddb211f924cdae532d8c5e4c8bb54728e12fff1b0cd5963a10" +checksum = "d97eb9a8e0cd5b76afea91d7eecd5cf8338cd44ced04256cf1f800474b227c52" dependencies = [ "bytemuck", "byteorder-lite", @@ -3253,9 +3253,9 @@ dependencies = [ [[package]] name = "impl-more" -version = "0.1.7" +version = "0.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e658178c10c747241199382079c0f195ce229866fbf4aa0d46fa6107fe33d2ec" +checksum 
= "aae21c3177a27788957044151cc2800043d127acaa460a47ebb9b84dfa2c6aa0" [[package]] name = "indexmap" @@ -3544,7 +3544,7 @@ dependencies = [ "itertools", "log", "num-traits", - "ordered-float 4.3.0", + "ordered-float 4.4.0", "sorted-vec", "tracing", "tracing-subscriber", @@ -4398,9 +4398,9 @@ dependencies = [ [[package]] name = "ordered-float" -version = "4.3.0" +version = "4.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "44d501f1a72f71d3c063a6bbc8f7271fa73aa09fe5d6283b6571e2ed176a2537" +checksum = "83e7ccb95e240b7c9506a3d544f10d935e142cc90b0a1d56954fb44d89ad6b97" dependencies = [ "num-traits", ] @@ -5278,9 +5278,9 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.87" +version = "1.0.88" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b3e4daa0dcf6feba26f985457cdf104d4b4256fc5a09547140f3631bb076b19a" +checksum = "7c3a7fc5db1e57d5a779a352c8cdb57b29aa4c40cc69c3a68a7fedc815fbf2f9" dependencies = [ "unicode-ident", ] @@ -6137,9 +6137,9 @@ dependencies = [ [[package]] name = "rustls" -version = "0.23.14" +version = "0.23.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "415d9944693cb90382053259f89fbb077ea730ad7273047ec63b19bc9b160ba8" +checksum = "5fbb44d7acc4e873d613422379f69f237a1b141928c02f6bc6ccfddddc2d7993" dependencies = [ "once_cell", "ring", @@ -6770,8 +6770,8 @@ checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" [[package]] name = "strum" -version = "0.26.2" -source = "git+https://github.com/jqnatividad/strum?branch=bump-phf-to-0.11#d60ce1afe338f4456a5c49ef7e51ee083330babe" +version = "0.26.3" +source = "git+https://github.com/jqnatividad/strum?branch=bump-phf-to-0.11#33c1b0848634ea8e85daed31c19dac0c162f25bd" dependencies = [ "phf", "strum_macros", @@ -6780,7 +6780,7 @@ dependencies = [ [[package]] name = "strum_macros" version = "0.26.4" -source = 
"git+https://github.com/jqnatividad/strum?branch=bump-phf-to-0.11#d60ce1afe338f4456a5c49ef7e51ee083330babe" +source = "git+https://github.com/jqnatividad/strum?branch=bump-phf-to-0.11#33c1b0848634ea8e85daed31c19dac0c162f25bd" dependencies = [ "heck 0.5.0", "proc-macro2", From a5b818562d5db7d65f00e5acd2c8bf7d44bd869a Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Thu, 17 Oct 2024 06:15:59 -0400 Subject: [PATCH 062/119] `validate`: specify min_len for parallel iterators --- src/cmd/validate.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/cmd/validate.rs b/src/cmd/validate.rs index ef152f1ff..fae5fa1f7 100644 --- a/src/cmd/validate.rs +++ b/src/cmd/validate.rs @@ -695,6 +695,7 @@ pub fn run(argv: &[&str]) -> CliResult<()> { // validation_results vector should have same row count and in same order as input CSV batch .par_iter() + .with_min_len(1024) .map(|record| do_json_validation(&header_types, header_len, record, &schema_compiled)) .collect_into_vec(&mut validation_results); From 495158f99e438be765d2d3d4a4552ca471842ee1 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Thu, 17 Oct 2024 09:48:06 -0400 Subject: [PATCH 063/119] `deps`: update governor fork with latest dependencies also update libc from 0.2.159 to 0.2.160 --- Cargo.lock | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e81c374cc..064af37cb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2772,12 +2772,13 @@ checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" [[package]] name = "governor" version = "0.6.3" -source = "git+https://github.com/jqnatividad/governor?branch=deps-bump-dashmap#d84d2c6b8be907535f45d3212c835dd2923ee650" +source = "git+https://github.com/jqnatividad/governor?branch=deps-bump-dashmap#f93388e4e2109b529d78b42cc66fe12c5d787e8f" dependencies = [ "cfg-if", "dashmap", - "futures", + "futures-sink", 
"futures-timer", + "futures-util", "no-std-compat", "nonzero_ext", "parking_lot 0.12.3", @@ -3629,9 +3630,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.159" +version = "0.2.160" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "561d97a539a36e26a9a5fad1ea11a3039a67714694aaa379433e580854bc3dc5" +checksum = "f0b21006cd1874ae9e650973c565615676dc4a274c965bb0a73796dac838ce4f" [[package]] name = "libflate" From 4a1a31e383a3ec5c462e3369d2eb68dcd0df8aa7 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Thu, 17 Oct 2024 14:06:15 -0400 Subject: [PATCH 064/119] `deps`: set MSRV to 1.82; enable serde in polars also update lock file --- Cargo.lock | 8 ++++---- Cargo.toml | 3 ++- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 064af37cb..5d7e62cfe 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3241,9 +3241,9 @@ dependencies = [ [[package]] name = "image" -version = "0.25.3" +version = "0.25.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d97eb9a8e0cd5b76afea91d7eecd5cf8338cd44ced04256cf1f800474b227c52" +checksum = "bc144d44a31d753b02ce64093d532f55ff8dc4ebf2ffb8a63c0dda691385acae" dependencies = [ "bytemuck", "byteorder-lite", @@ -5193,9 +5193,9 @@ checksum = "cc9c68a3f6da06753e9335d63e27f6b9754dd1920d941135b7ea8224f141adb2" [[package]] name = "portable-atomic-util" -version = "0.2.2" +version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fcdd8420072e66d54a407b3316991fe946ce3ab1083a7f575b2463866624704d" +checksum = "90a7d5beecc52a491b54d6dd05c7a45ba1801666a5baad9fdbfc6fef8d2d206c" dependencies = [ "portable-atomic", ] diff --git a/Cargo.toml b/Cargo.toml index 33d358f21..ca4f7635c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,7 +12,7 @@ categories = ["command-line-utilities", "parser-implementations"] license = "MIT OR Unlicense" autotests = false edition = "2021" 
-rust-version = "1.81" +rust-version = "1.82" autobins = false include = [ "src/**/*", @@ -167,6 +167,7 @@ polars = { version = "0.43", features = [ "performant", "pivot", "semi_anti_join", + "serde", "serde-lazy", "sql", "streaming", From d5806a2e43f4d69ddfbb0e15293f4614bd683576 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Thu, 17 Oct 2024 21:57:20 -0400 Subject: [PATCH 065/119] `deps`: use release version of serde_json now that 0.129 has been released also update libc from 0.2.160 to 0.2.161 --- Cargo.lock | 9 +++++---- Cargo.toml | 3 --- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5d7e62cfe..614bde5d4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3630,9 +3630,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.160" +version = "0.2.161" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f0b21006cd1874ae9e650973c565615676dc4a274c965bb0a73796dac838ce4f" +checksum = "8e9489c2807c139ffd9c1794f4af0ebe86a828db53ecdc7fea2111d0fed085d1" [[package]] name = "libflate" @@ -6358,8 +6358,9 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.128" -source = "git+https://github.com/serde-rs/json?rev=faab2e8#faab2e8d2fcf781a3f77f329df836ffb3aaacfba" +version = "1.0.129" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6dbcf9b78a125ee667ae19388837dd12294b858d101fdd393cb9d5501ef09eb2" dependencies = [ "indexmap", "itoa", diff --git a/Cargo.toml b/Cargo.toml index ca4f7635c..b7bab0eb0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -301,9 +301,6 @@ local-encoding = { git = "https://github.com/slonopotamus/local-encoding-rs", br # use upstream publicsuffix with unreleased dependency bumps publicsuffix = { git = "https://github.com/rushmorem/publicsuffix", rev = "b300356" } -# use serde_json with unreleased optimizations -serde_json = { git = "https://github.com/serde-rs/json", rev = "faab2e8" } - # use 
our fork of strum with bumped phf dependency. They're holding back as they have a conservative # MSRV policy, while qsv has an aggressive MSRV policy to always require Rust stable strum = { git = "https://github.com/jqnatividad/strum", branch = "bump-phf-to-0.11" } From 19c187b32106cada368dac19e161db1720d5bc5a Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Thu, 17 Oct 2024 22:01:22 -0400 Subject: [PATCH 066/119] `docs`: MSRV is 1.82.0 [skip ci] --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index dba160330..3d2365d5e 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ [![Crates.io](https://img.shields.io/crates/v/qsv.svg?logo=crates.io)](https://crates.io/crates/qsv) [![Crates.io downloads](https://img.shields.io/crates/d/qsv?color=orange&label=crates.io%20downloads)](https://crates.io/crates/qsv) [![Prebuilt Downloads](https://img.shields.io/github/downloads/jqnatividad/qsv/total?logo=github&label=prebuilt%20downloads)](https://github.com/jqnatividad/qsv/releases/latest) -[![Minimum supported Rust version](https://img.shields.io/badge/Rust-1.81.0-red?logo=rust)](#minimum-supported-rust-version) +[![Minimum supported Rust version](https://img.shields.io/badge/Rust-1.82.0-red?logo=rust)](#minimum-supported-rust-version) [![FOSSA Status](https://app.fossa.com/api/projects/git%2Bgithub.com%2Fjqnatividad%2Fqsv.svg?type=shield)](https://app.fossa.com/projects/git%2Bgithub.com%2Fjqnatividad%2Fqsv?ref=badge_shield)
From 0d576054c792cb1769eba6a0900faf8234973d29 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 18 Oct 2024 02:01:48 +0000 Subject: [PATCH 067/119] build(deps): bump csvs_convert from 0.8.14 to 0.9.0 Bumps [csvs_convert](https://github.com/kindly/csvs_convert) from 0.8.14 to 0.9.0. - [Changelog](https://github.com/kindly/csvs_convert/blob/main/changelog.md) - [Commits](https://github.com/kindly/csvs_convert/commits) --- updated-dependencies: - dependency-name: csvs_convert dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- Cargo.lock | 16 +++++++++++++--- Cargo.toml | 2 +- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 614bde5d4..41021ff31 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1810,9 +1810,9 @@ dependencies = [ [[package]] name = "csvs_convert" -version = "0.8.14" +version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e92b2974fd0f90b40372394e5a74e669d72dd133d01a36b1891e4514364af846" +checksum = "dab8874aa7b5cabd1257053086dcb7588b1fef40b6d56cdb2c9c59cf56ecaa48" dependencies = [ "chrono", "counter", @@ -1829,6 +1829,7 @@ dependencies = [ "rand", "regex", "rusqlite", + "rust_xlsxwriter", "serde_json", "snafu 0.8.5", "streaming-stats", @@ -1837,7 +1838,6 @@ dependencies = [ "threadpool", "typed-builder", "walkdir", - "xlsxwriter", "zip", ] @@ -6096,6 +6096,16 @@ dependencies = [ "num-traits", ] +[[package]] +name = "rust_xlsxwriter" +version = "0.79.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7559b53ef12183c895f2433822b1a711bfdc7e2979d41788272cd5c43934625c" +dependencies = [ + "tempfile", + "zip", +] + [[package]] name = "rustc-demangle" version = "0.1.24" diff --git a/Cargo.toml b/Cargo.toml index b7bab0eb0..0fd21c6d2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -94,7 +94,7 @@ csv-core = "0.1" csv-diff = "0.1.0" 
csv-index = "0.1" csvlens = { version = "0.10", optional = true } -csvs_convert = { version = "0.8.14", default-features = false, features = [ +csvs_convert = { version = "0.9.0", default-features = false, features = [ "converters", ], optional = true } data-encoding = { version = "2.6", optional = true } From 1ee9ec38447bbd8f37e686cfe5f73f6e13b5ded7 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Thu, 17 Oct 2024 22:09:01 -0400 Subject: [PATCH 068/119] `deps`: remove patched xlsxwriter dependency now that csvs_convert 0.9.0 has switched to rust_xlsxwriter --- Cargo.lock | 83 ++---------------------------------------------------- Cargo.toml | 6 +--- 2 files changed, 4 insertions(+), 85 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 41021ff31..c7e1ffbd9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -873,26 +873,6 @@ dependencies = [ "serde", ] -[[package]] -name = "bindgen" -version = "0.70.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f49d8fed880d473ea71efb9bf597651e77201bdd4893efe54c9e5d65ae04ce6f" -dependencies = [ - "bitflags 2.6.0", - "cexpr", - "clang-sys", - "itertools", - "log", - "prettyplease", - "proc-macro2", - "quote", - "regex", - "rustc-hash 1.1.0", - "shlex", - "syn 2.0.79", -] - [[package]] name = "bit-set" version = "0.5.3" @@ -1186,15 +1166,6 @@ dependencies = [ "once_cell", ] -[[package]] -name = "cexpr" -version = "0.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766" -dependencies = [ - "nom", -] - [[package]] name = "cfg-if" version = "1.0.0" @@ -1221,10 +1192,8 @@ checksum = "a21f936df1771bf62b77f047b726c4625ff2e8aa607c01ec06e5a05bd8463401" dependencies = [ "android-tzdata", "iana-time-zone", - "js-sys", "num-traits", "serde", - "wasm-bindgen", "windows-targets 0.52.6", ] @@ -1312,17 +1281,6 @@ dependencies = [ "inout", ] -[[package]] -name = "clang-sys" 
-version = "1.8.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4" -dependencies = [ - "glob", - "libc", - "libloading", -] - [[package]] name = "clap" version = "4.5.20" @@ -3702,15 +3660,6 @@ dependencies = [ "vcpkg", ] -[[package]] -name = "libxlsxwriter-sys" -version = "1.1.7" -source = "git+https://github.com/jqnatividad/xlsxwriter-rs?branch=bump-bindgen-to-0.70.1#671ac451bc3c4dfa9235df11c2afa338eec70c7f" -dependencies = [ - "bindgen", - "cc", -] - [[package]] name = "linux-raw-sys" version = "0.4.14" @@ -3961,7 +3910,7 @@ dependencies = [ "mlua-sys", "num-traits", "once_cell", - "rustc-hash 2.0.0", + "rustc-hash", "serde", "serde-value", ] @@ -5258,16 +5207,6 @@ dependencies = [ "zerocopy", ] -[[package]] -name = "prettyplease" -version = "0.2.22" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "479cf940fbbb3426c32c5d5176f62ad57549a0bb84773423ba8be9d089f5faba" -dependencies = [ - "proc-macro2", - "syn 2.0.79", -] - [[package]] name = "proc-macro-crate" version = "3.2.0" @@ -5485,7 +5424,6 @@ dependencies = [ "url", "uuid", "whatlang", - "xlsxwriter", "xxhash-rust", ] @@ -5629,7 +5567,7 @@ dependencies = [ "pin-project-lite", "quinn-proto", "quinn-udp", - "rustc-hash 2.0.0", + "rustc-hash", "rustls", "socket2", "thiserror", @@ -5646,7 +5584,7 @@ dependencies = [ "bytes", "rand", "ring", - "rustc-hash 2.0.0", + "rustc-hash", "rustls", "slab", "thiserror", @@ -6112,12 +6050,6 @@ version = "0.1.24" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f" -[[package]] -name = "rustc-hash" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" - [[package]] name = "rustc-hash" version = "2.0.0" @@ -8149,15 +8081,6 @@ dependencies = [ "windows-sys 
0.59.0", ] -[[package]] -name = "xlsxwriter" -version = "0.6.1" -source = "git+https://github.com/jqnatividad/xlsxwriter-rs?branch=bump-bindgen-to-0.70.1#671ac451bc3c4dfa9235df11c2afa338eec70c7f" -dependencies = [ - "chrono", - "libxlsxwriter-sys", -] - [[package]] name = "xxhash-rust" version = "0.8.12" diff --git a/Cargo.toml b/Cargo.toml index 0fd21c6d2..bec5c7bda 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -238,7 +238,6 @@ tokio = { version = "1", features = ["rt-multi-thread"] } uuid = { version = "1", features = ["v4", "v7"] } url = "2.5" whatlang = { version = "0.16", optional = true } -xlsxwriter = { version = "0.6", optional = true } xxhash-rust = { version = "0.8", features = ["xxh3"] } # enable parking_lot hardware lock elision on x86_64 @@ -306,9 +305,6 @@ publicsuffix = { git = "https://github.com/rushmorem/publicsuffix", rev = "b3003 strum = { git = "https://github.com/jqnatividad/strum", branch = "bump-phf-to-0.11" } strum_macros = { git = "https://github.com/jqnatividad/strum", branch = "bump-phf-to-0.11" } -# use of fork of xlsxwriter with bumped bindgen dependency -xlsxwriter = { git = "https://github.com/jqnatividad/xlsxwriter-rs", branch = "bump-bindgen-to-0.70.1" } - # Polars has a much higher release tempo for its Python bindings compared # to its underlying Rust library. 
See https://github.com/pola-rs/polars/releases # It's qsv's policy to use the latest upstream of polars/py-polars @@ -379,7 +375,7 @@ luau = ["mlua", "sanitize-filename", "simple-expand-tilde"] polars = ["dep:polars", "bytemuck"] prompt = ["rfd"] python = ["pyo3"] -to = ["csvs_convert", "xlsxwriter"] +to = ["csvs_convert"] lens = ["csvlens"] lite = [] datapusher_plus = ["self_update"] From c452d94ec5552ee2b2a4bf43a14c5697e50d5927 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Thu, 17 Oct 2024 23:25:11 -0400 Subject: [PATCH 069/119] `docs`: `.ssv` file auto-delimiter support [skip ci] --- docs/ENVIRONMENT_VARIABLES.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ENVIRONMENT_VARIABLES.md b/docs/ENVIRONMENT_VARIABLES.md index ed0a6ea05..d7e1411eb 100644 --- a/docs/ENVIRONMENT_VARIABLES.md +++ b/docs/ENVIRONMENT_VARIABLES.md @@ -3,7 +3,7 @@ | Variable | Description | | --- | --- | | `QSV_DOTENV_PATH` | The full pathname of the dotenv file to load, OVERRIDING existing environment variables. This takes precedence over any other dotenv files in the filesystem. | -| `QSV_DEFAULT_DELIMITER` | single ascii character to use as delimiter. Overrides `--delimiter` option. Defaults to "," (comma) for CSV files & "\t" (tab) for TSV files when not set. Note that this will also set the delimiter for qsv's output to stdout.
However, using the `--output` option, regardless of this environment variable, will automatically change the delimiter used in the generated file based on the file extension - i.e. comma for `.csv`, tab for `.tsv` & `.tab` files. | +| `QSV_DEFAULT_DELIMITER` | single ascii character to use as delimiter. Overrides `--delimiter` option. Defaults to "," (comma) for CSV files & "\t" (tab) for TSV files when not set. Note that this will also set the delimiter for qsv's output to stdout.
However, using the `--output` option, regardless of this environment variable, will automatically change the delimiter used in the generated file based on the file extension - i.e. comma for `.csv`; tab for `.tsv` & `.tab` ; and semicolon for `.ssv` files | | `QSV_SNIFF_DELIMITER` | if set, the delimiter is automatically detected. Overrides `QSV_DEFAULT_DELIMITER` & `--delimiter` option. Note that this does not work with stdin. | | `QSV_NO_HEADERS` | if set, the first row will **NOT** be interpreted as headers. Supersedes `QSV_TOGGLE_HEADERS`. | | `QSV_TOGGLE_HEADERS` | if set to `1`, toggles header setting - i.e. inverts qsv header behavior, with no headers being the default, & setting `--no-headers` will actually mean headers will not be ignored. | From 803721edd961719d3e36bc4d3e5d85241d2bbbcb Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Fri, 18 Oct 2024 00:13:36 -0400 Subject: [PATCH 070/119] `sqlp`: fix read_csv optimization the if condition to check if there were no sqlp options passed was faulty. This caused `sqlp` to ALWAYS use the slow path, rather than using the fast path read_csv optimization when no options were passed Also removed the deeplink to polars-sql source code and just point to Polars SQL reference manual. The manual is maintained and shows the Polars SQL syntax, even though its targeted to py-polars users. --- src/cmd/sqlp.rs | 11 ++++------- tests/test_sqlp.rs | 3 ++- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/src/cmd/sqlp.rs b/src/cmd/sqlp.rs index a45c8e751..9b06155c4 100644 --- a/src/cmd/sqlp.rs +++ b/src/cmd/sqlp.rs @@ -6,11 +6,8 @@ Polars SQL is a SQL dialect, converting SQL queries to fast Polars LazyFrame exp (see https://docs.pola.rs/user-guide/sql/intro/). 
For a list of SQL functions and keywords supported by Polars SQL, see -https://github.com/pola-rs/polars/blob/ee9bafbdef7d62baa06d469f42e6cec0755eb544/crates/polars-sql/src/functions.rs#L32 and -https://github.com/pola-rs/polars/blob/ee9bafbdef7d62baa06d469f42e6cec0755eb544/crates/polars-sql/src/keywords.rs. -https://docs.pola.rs/py-polars/html/reference/sql/index.html also provides a more readable -version of the SQL functions and keywords, though be aware that it's for the Python version -of Polars, so there will be some minor syntax differences. +https://docs.pola.rs/py-polars/html/reference/sql/index.html though be aware that it's for +the Python version of Polars, so there will be some minor syntax differences. Returns the shape of the query result (number of rows, number of columns) to stderr. @@ -679,12 +676,12 @@ pub fn run(argv: &[&str]) -> CliResult<()> { && delim == b',' && !args.flag_no_optimizations && !args.flag_try_parsedates - && args.flag_infer_len != 10_000 + && args.flag_infer_len == 10_000 // make sure this matches the usage text default && !args.flag_streaming && !args.flag_low_memory && !args.flag_truncate_ragged_lines && !args.flag_ignore_errors - && args.flag_rnull_values.is_empty() + && rnull_values == vec![PlSmallStr::EMPTY] && !args.flag_decimal_comma && comment_char.is_none() && std::path::Path::new(&args.arg_input[0]) diff --git a/tests/test_sqlp.rs b/tests/test_sqlp.rs index 7a221bb38..70258492d 100644 --- a/tests/test_sqlp.rs +++ b/tests/test_sqlp.rs @@ -1532,7 +1532,7 @@ fn sqlp_length_fns() { let expected = vec![ svec!["words", "n_chrs1", "n_chrs2", "n_chrs3", "n_bytes", "n_bits"], svec!["Cafe", "4", "4", "4", "4", "32"], - svec!["", "", "", "", "", ""], + svec!["", "0", "0", "0", "0", "0"], svec!["東京", "2", "2", "2", "6", "48"], ]; @@ -1666,6 +1666,7 @@ fn sqlp_string_replace() { let expected = vec![ svec!["words"], svec!["English breakfast tea is the best tea"], + svec![""], ]; assert_eq!(got, expected); From 
98d35f9166951b6af6c8ec5222845e217a93dc5c Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Fri, 18 Oct 2024 00:19:57 -0400 Subject: [PATCH 071/119] `deps`: use latest polars upstream --- Cargo.lock | 60 +++++++++++++++++++++++++++--------------------------- Cargo.toml | 4 ++-- 2 files changed, 32 insertions(+), 32 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c7e1ffbd9..fbc68aa83 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4441,16 +4441,6 @@ dependencies = [ "windows-targets 0.52.6", ] -[[package]] -name = "parquet-format-safe" -version = "0.2.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1131c54b167dd4e4799ce762e1ab01549ebb94d5bdd13e6ec1b467491c378e1f" -dependencies = [ - "async-trait", - "futures", -] - [[package]] name = "parse-zoneinfo" version = "0.3.1" @@ -4646,7 +4636,7 @@ dependencies = [ [[package]] name = "polars" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=900dc3b#900dc3b60c0ba050d9b19c936b772e101fda830e" +source = "git+https://github.com/pola-rs/polars?rev=01a4e06#01a4e067f50fd5f8825d4b25402336310e125d8d" dependencies = [ "getrandom", "polars-arrow", @@ -4665,7 +4655,7 @@ dependencies = [ [[package]] name = "polars-arrow" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=900dc3b#900dc3b60c0ba050d9b19c936b772e101fda830e" +source = "git+https://github.com/pola-rs/polars?rev=01a4e06#01a4e067f50fd5f8825d4b25402336310e125d8d" dependencies = [ "ahash", "atoi", @@ -4712,7 +4702,7 @@ dependencies = [ [[package]] name = "polars-compute" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=900dc3b#900dc3b60c0ba050d9b19c936b772e101fda830e" +source = "git+https://github.com/pola-rs/polars?rev=01a4e06#01a4e067f50fd5f8825d4b25402336310e125d8d" dependencies = [ "bytemuck", "either", @@ -4727,7 +4717,7 @@ dependencies = [ [[package]] name = "polars-core" version = "0.43.1" -source = 
"git+https://github.com/pola-rs/polars?rev=900dc3b#900dc3b60c0ba050d9b19c936b772e101fda830e" +source = "git+https://github.com/pola-rs/polars?rev=01a4e06#01a4e067f50fd5f8825d4b25402336310e125d8d" dependencies = [ "ahash", "bitflags 2.6.0", @@ -4761,7 +4751,7 @@ dependencies = [ [[package]] name = "polars-error" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=900dc3b#900dc3b60c0ba050d9b19c936b772e101fda830e" +source = "git+https://github.com/pola-rs/polars?rev=01a4e06#01a4e067f50fd5f8825d4b25402336310e125d8d" dependencies = [ "avro-schema", "object_store", @@ -4774,7 +4764,7 @@ dependencies = [ [[package]] name = "polars-expr" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=900dc3b#900dc3b60c0ba050d9b19c936b772e101fda830e" +source = "git+https://github.com/pola-rs/polars?rev=01a4e06#01a4e067f50fd5f8825d4b25402336310e125d8d" dependencies = [ "ahash", "bitflags 2.6.0", @@ -4794,7 +4784,7 @@ dependencies = [ [[package]] name = "polars-io" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=900dc3b#900dc3b60c0ba050d9b19c936b772e101fda830e" +source = "git+https://github.com/pola-rs/polars?rev=01a4e06#01a4e067f50fd5f8825d4b25402336310e125d8d" dependencies = [ "ahash", "async-trait", @@ -4840,7 +4830,7 @@ dependencies = [ [[package]] name = "polars-json" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=900dc3b#900dc3b60c0ba050d9b19c936b772e101fda830e" +source = "git+https://github.com/pola-rs/polars?rev=01a4e06#01a4e067f50fd5f8825d4b25402336310e125d8d" dependencies = [ "ahash", "chrono", @@ -4861,7 +4851,7 @@ dependencies = [ [[package]] name = "polars-lazy" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=900dc3b#900dc3b60c0ba050d9b19c936b772e101fda830e" +source = "git+https://github.com/pola-rs/polars?rev=01a4e06#01a4e067f50fd5f8825d4b25402336310e125d8d" dependencies = [ "ahash", "bitflags 2.6.0", @@ -4887,7 +4877,7 @@ dependencies = [ [[package]] name = 
"polars-mem-engine" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=900dc3b#900dc3b60c0ba050d9b19c936b772e101fda830e" +source = "git+https://github.com/pola-rs/polars?rev=01a4e06#01a4e067f50fd5f8825d4b25402336310e125d8d" dependencies = [ "futures", "memmap2", @@ -4908,7 +4898,7 @@ dependencies = [ [[package]] name = "polars-ops" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=900dc3b#900dc3b60c0ba050d9b19c936b772e101fda830e" +source = "git+https://github.com/pola-rs/polars?rev=01a4e06#01a4e067f50fd5f8825d4b25402336310e125d8d" dependencies = [ "ahash", "argminmax", @@ -4941,7 +4931,7 @@ dependencies = [ [[package]] name = "polars-parquet" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=900dc3b#900dc3b60c0ba050d9b19c936b772e101fda830e" +source = "git+https://github.com/pola-rs/polars?rev=01a4e06#01a4e067f50fd5f8825d4b25402336310e125d8d" dependencies = [ "ahash", "async-stream", @@ -4954,10 +4944,10 @@ dependencies = [ "hashbrown 0.15.0", "lz4", "num-traits", - "parquet-format-safe", "polars-arrow", "polars-compute", "polars-error", + "polars-parquet-format", "polars-utils", "serde", "simdutf8", @@ -4966,10 +4956,19 @@ dependencies = [ "zstd", ] +[[package]] +name = "polars-parquet-format" +version = "2.10.0" +source = "git+https://github.com/pola-rs/parquet-format#b96e00d2b054739ee02da06987bcd7f44b82a4ef" +dependencies = [ + "async-trait", + "futures", +] + [[package]] name = "polars-pipe" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=900dc3b#900dc3b60c0ba050d9b19c936b772e101fda830e" +source = "git+https://github.com/pola-rs/polars?rev=01a4e06#01a4e067f50fd5f8825d4b25402336310e125d8d" dependencies = [ "crossbeam-channel", "crossbeam-queue", @@ -4995,7 +4994,7 @@ dependencies = [ [[package]] name = "polars-plan" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=900dc3b#900dc3b60c0ba050d9b19c936b772e101fda830e" +source = 
"git+https://github.com/pola-rs/polars?rev=01a4e06#01a4e067f50fd5f8825d4b25402336310e125d8d" dependencies = [ "ahash", "bitflags 2.6.0", @@ -5007,6 +5006,7 @@ dependencies = [ "futures", "hashbrown 0.15.0", "memmap2", + "num-traits", "once_cell", "percent-encoding", "polars-arrow", @@ -5028,7 +5028,7 @@ dependencies = [ [[package]] name = "polars-row" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=900dc3b#900dc3b60c0ba050d9b19c936b772e101fda830e" +source = "git+https://github.com/pola-rs/polars?rev=01a4e06#01a4e067f50fd5f8825d4b25402336310e125d8d" dependencies = [ "bytemuck", "polars-arrow", @@ -5039,7 +5039,7 @@ dependencies = [ [[package]] name = "polars-schema" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=900dc3b#900dc3b60c0ba050d9b19c936b772e101fda830e" +source = "git+https://github.com/pola-rs/polars?rev=01a4e06#01a4e067f50fd5f8825d4b25402336310e125d8d" dependencies = [ "indexmap", "polars-error", @@ -5051,7 +5051,7 @@ dependencies = [ [[package]] name = "polars-sql" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=900dc3b#900dc3b60c0ba050d9b19c936b772e101fda830e" +source = "git+https://github.com/pola-rs/polars?rev=01a4e06#01a4e067f50fd5f8825d4b25402336310e125d8d" dependencies = [ "hex", "once_cell", @@ -5072,7 +5072,7 @@ dependencies = [ [[package]] name = "polars-time" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=900dc3b#900dc3b60c0ba050d9b19c936b772e101fda830e" +source = "git+https://github.com/pola-rs/polars?rev=01a4e06#01a4e067f50fd5f8825d4b25402336310e125d8d" dependencies = [ "atoi", "bytemuck", @@ -5092,7 +5092,7 @@ dependencies = [ [[package]] name = "polars-utils" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=900dc3b#900dc3b60c0ba050d9b19c936b772e101fda830e" +source = "git+https://github.com/pola-rs/polars?rev=01a4e06#01a4e067f50fd5f8825d4b25402336310e125d8d" dependencies = [ "ahash", "bytemuck", diff --git a/Cargo.toml 
b/Cargo.toml index bec5c7bda..4835643d6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -314,9 +314,9 @@ strum_macros = { git = "https://github.com/jqnatividad/strum", branch = "bump-ph # BUILD NOTE: Be sure to set QSV_POLARS_REV below to the latest commit short hash or tag # of polars/py-polars before building qsv. This allows us to show the polars rev/tag in --version. # if we are using a release version of Rust Polars, leave QSV_POLARS_REV empty -# QSV_POLARS_REV=900dc3b +# QSV_POLARS_REV=01a4e06 # polars = { git = "https://github.com/pola-rs/polars", tag = "py-1.9.0" } -polars = { git = "https://github.com/pola-rs/polars", rev = "900dc3b" } +polars = { git = "https://github.com/pola-rs/polars", rev = "01a4e06" } [features] default = ["mimalloc"] From 41359ca0fc5f138a1bec29b7126550deb0211c31 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Fri, 18 Oct 2024 01:36:50 -0400 Subject: [PATCH 072/119] `deps`: use latest tweaked csv fork --- Cargo.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index fbc68aa83..7e0aaf3a7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1692,7 +1692,7 @@ dependencies = [ [[package]] name = "csv" version = "1.3.0" -source = "git+https://github.com/jqnatividad/rust-csv?branch=qsv-optimized#8cf30edc2ec3eb701cf708504a6ff54bd217afea" +source = "git+https://github.com/jqnatividad/rust-csv?branch=qsv-optimized#f2c22500da5b0faf0b3a734876b47031b0f80b57" dependencies = [ "csv-core", "itoa", @@ -1704,7 +1704,7 @@ dependencies = [ [[package]] name = "csv-core" version = "0.1.11" -source = "git+https://github.com/jqnatividad/rust-csv?branch=qsv-optimized#8cf30edc2ec3eb701cf708504a6ff54bd217afea" +source = "git+https://github.com/jqnatividad/rust-csv?branch=qsv-optimized#f2c22500da5b0faf0b3a734876b47031b0f80b57" dependencies = [ "memchr", ] @@ -1727,7 +1727,7 @@ dependencies = [ [[package]] name = "csv-index" version = "0.1.6" -source = 
"git+https://github.com/jqnatividad/rust-csv?branch=qsv-optimized#8cf30edc2ec3eb701cf708504a6ff54bd217afea" +source = "git+https://github.com/jqnatividad/rust-csv?branch=qsv-optimized#f2c22500da5b0faf0b3a734876b47031b0f80b57" dependencies = [ "byteorder", "csv", From 179e051ae677388622d8130281b613aee9fe03e2 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Fri, 18 Oct 2024 09:58:50 -0400 Subject: [PATCH 073/119] `stats`: add note about validating input CSV files in case of `stats` errors [skip ci] --- src/cmd/stats.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/cmd/stats.rs b/src/cmd/stats.rs index 4896da189..6d2001d95 100644 --- a/src/cmd/stats.rs +++ b/src/cmd/stats.rs @@ -1,5 +1,9 @@ static USAGE: &str = r#" -Compute summary statistics & infers data types for each column in a CSV. +Compute summary statistics & infers data types for each column in a CSV. + +> NOTE: `stats` is heavily optimized for speed. It assumes the CSV is well-formed and +UTF-8 encoded. If you encounter problems generating stats, use `qsv validate` to confirm the +input CSV is valid. Summary statistics includes sum, min/max/range, sort order, min/max/sum/avg length, mean, standard error of the mean (SEM), stddev, variance, coefficient of variation (CV), nullcount, From c216675af64357cf6aa3da61a362185c7c4462e7 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 18 Oct 2024 14:43:13 +0000 Subject: [PATCH 074/119] build(deps): bump redis from 0.27.4 to 0.27.5 Bumps [redis](https://github.com/redis-rs/redis-rs) from 0.27.4 to 0.27.5. - [Release notes](https://github.com/redis-rs/redis-rs/releases) - [Commits](https://github.com/redis-rs/redis-rs/compare/redis-0.27.4...redis-0.27.5) --- updated-dependencies: - dependency-name: redis dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] --- Cargo.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 7e0aaf3a7..9d41a648a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5393,7 +5393,7 @@ dependencies = [ "rand_hc", "rand_xoshiro", "rayon", - "redis 0.27.4", + "redis 0.27.5", "regex", "reqwest", "rfd", @@ -5784,9 +5784,9 @@ dependencies = [ [[package]] name = "redis" -version = "0.27.4" +version = "0.27.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc6baebe319ef5e4b470f248335620098d1c2e9261e995be05f56f719ca4bdb2" +checksum = "81cccf17a692ce51b86564334614d72dcae1def0fd5ecebc9f02956da74352b5" dependencies = [ "ahash", "arc-swap", From d42716735fbf32e00c57b02a6716ed88f359f506 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 18 Oct 2024 17:53:02 +0000 Subject: [PATCH 075/119] build(deps): bump serde_json from 1.0.129 to 1.0.130 Bumps [serde_json](https://github.com/serde-rs/json) from 1.0.129 to 1.0.130. - [Release notes](https://github.com/serde-rs/json/releases) - [Commits](https://github.com/serde-rs/json/compare/1.0.129...1.0.130) --- updated-dependencies: - dependency-name: serde_json dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] --- Cargo.lock | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 9d41a648a..0eb08ce2a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6300,9 +6300,9 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.129" +version = "1.0.130" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6dbcf9b78a125ee667ae19388837dd12294b858d101fdd393cb9d5501ef09eb2" +checksum = "610f75ff4a8e3cb29b85da56eabdd1bff5b06739059a4b8e2967fef32e5d9944" dependencies = [ "indexmap", "itoa", From c9baadcb6f3b13aa5dfaa4b9682a93ea9783a42b Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sat, 19 Oct 2024 07:38:23 -0400 Subject: [PATCH 076/119] `deps`: add smallvec crate also update lock file --- Cargo.lock | 20 +++++++++----------- Cargo.toml | 1 + 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0eb08ce2a..5675d456a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -311,9 +311,9 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.89" +version = "1.0.90" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "86fdf8605db99b54d3cd748a44c6d04df638eb5dafb219b135d0149bd0db01f6" +checksum = "37bf3594c4c988a53154954629820791dde498571819ae4ca50ca811e060cc95" [[package]] name = "arbitrary" @@ -1148,9 +1148,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.1.30" +version = "1.1.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b16803a61b81d9eabb7eae2588776c4c1e584b738ede45fdbb4c972cec1e9945" +checksum = "c2e7962b54006dcfcc61cb72735f4d89bb97061dd6a7ed882ec6b8ee53714c6f" dependencies = [ "jobserver", "libc", @@ -5410,6 +5410,7 @@ dependencies = [ "simd-json", "simdutf8", "simple-expand-tilde", + "smallvec", "snap", "strsim", "strum", @@ -6300,9 +6301,9 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.130" +version = 
"1.0.131" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "610f75ff4a8e3cb29b85da56eabdd1bff5b06739059a4b8e2967fef32e5d9944" +checksum = "67d42a0bd4ac281beff598909bb56a86acaf979b84483e1c79c10dcaf98f8cf3" dependencies = [ "indexmap", "itoa", @@ -7255,12 +7256,9 @@ dependencies = [ [[package]] name = "unicase" -version = "2.7.0" +version = "2.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f7d2d4dafb69621809a81864c9c1b864479e1235c0dd4e199924b9742439ed89" -dependencies = [ - "version_check", -] +checksum = "7e51b68083f157f853b6379db119d1c1be0e6e4dec98101079dec41f6f5cf6df" [[package]] name = "unicode-bidi" diff --git a/Cargo.toml b/Cargo.toml index 4835643d6..352241129 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -224,6 +224,7 @@ serde_json = { version = "1", features = ["preserve_order"] } serde_stacker = { version = "0.1", optional = true } serde_urlencoded = { version = "0.7", optional = true } simple-expand-tilde = { version = "0.4.3", optional = true } +smallvec = "1" snap = "1" strsim = { version = "0.11", optional = true } strum = { version = "0.26", features = ["phf"] } From 0450193a70593f94e28b7e1fb1da96c5d665eb4a Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sat, 19 Oct 2024 07:40:16 -0400 Subject: [PATCH 077/119] `apply` & `applydp`: use smallvec for operations_vec which is normally very small so we can store it inline rather than allocating on the heap --- src/cmd/apply.rs | 9 +++++---- src/cmd/applydp.rs | 9 +++++---- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/src/cmd/apply.rs b/src/cmd/apply.rs index fbd70b20a..12b6562f1 100644 --- a/src/cmd/apply.rs +++ b/src/cmd/apply.rs @@ -321,6 +321,7 @@ use rayon::{ }; use regex::Regex; use serde::Deserialize; +use smallvec::SmallVec; use strsim::{ damerau_levenshtein, hamming, jaro_winkler, normalized_damerau_levenshtein, osa_distance, sorensen_dice, @@ -508,7 +509,7 @@ pub fn 
run(argv: &[&str]) -> CliResult<()> { String::new() }; - let mut ops_vec: Vec = Vec::new(); + let mut ops_vec = SmallVec::<[Operations; 4]>::new(); let apply_cmd = if args.cmd_operations { match validate_operations( @@ -727,7 +728,7 @@ fn validate_operations( flag_replacement: &str, flag_new_column: Option<&String>, flag_formatstr: &str, -) -> Result, CliError> { +) -> Result, CliError> { let mut censor_invokes = 0_u8; let mut copy_invokes = 0_u8; let mut eudex_invokes = 0_u8; @@ -738,7 +739,7 @@ fn validate_operations( let mut strip_invokes = 0_u8; let mut whatlang_invokes = 0_u8; - let mut ops_vec: Vec = Vec::with_capacity(operations.len()); + let mut ops_vec = SmallVec::with_capacity(operations.len()); for op in operations { let Ok(operation) = Operations::from_str(op) else { @@ -960,7 +961,7 @@ fn validate_operations( #[inline] fn apply_operations( - ops_vec: &Vec, + ops_vec: &SmallVec<[Operations; 4]>, cell: &mut String, comparand: &str, replacement: &str, diff --git a/src/cmd/applydp.rs b/src/cmd/applydp.rs index f4e9215e7..fa42f80d6 100644 --- a/src/cmd/applydp.rs +++ b/src/cmd/applydp.rs @@ -193,6 +193,7 @@ use rayon::{ }; use regex::Regex; use serde::Deserialize; +use smallvec::SmallVec; use strum_macros::EnumString; use crate::{ @@ -331,7 +332,7 @@ pub fn run(argv: &[&str]) -> CliResult<()> { EmptyReplace, } - let mut ops_vec: Vec = Vec::new(); + let mut ops_vec = SmallVec::<[Operations; 4]>::new(); let applydp_cmd = if args.cmd_operations { match validate_operations( @@ -496,13 +497,13 @@ fn validate_operations( flag_replacement: &str, flag_new_column: &Option, flag_formatstr: &str, -) -> Result, CliError> { +) -> Result, CliError> { let mut copy_invokes = 0_u8; let mut regex_replace_invokes = 0_u8; let mut replace_invokes = 0_u8; let mut strip_invokes = 0_u8; - let mut ops_vec: Vec = Vec::with_capacity(operations.len()); + let mut ops_vec = SmallVec::with_capacity(operations.len()); for op in operations { let Ok(operation) = Operations::from_str(op) 
else { @@ -587,7 +588,7 @@ fn validate_operations( #[inline] fn applydp_operations( - ops_vec: &Vec, + ops_vec: &SmallVec<[Operations; 4]>, cell: &mut String, comparand: &str, replacement: &str, From bc837ae698f3aee06ea9b846b98ea0c75820a22d Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sat, 19 Oct 2024 10:23:29 -0400 Subject: [PATCH 078/119] `apply`: optimize - use lazy OnceLock initialization for GENDER_GUESER - lazily init SENTIMENT_ANALYZER - use itoa and ryu for accelerated integer and float to alpha conversions - use more to_owned, into_owned and clone_into instead of to_string to minimize allocations --- src/cmd/apply.rs | 79 +++++++++++++++++++++++++++++++++--------------- 1 file changed, 55 insertions(+), 24 deletions(-) diff --git a/src/cmd/apply.rs b/src/cmd/apply.rs index 12b6562f1..dd63820c4 100644 --- a/src/cmd/apply.rs +++ b/src/cmd/apply.rs @@ -418,6 +418,7 @@ static SENTIMENT_ANALYZER: OnceLock = OnceLock::new( static THOUSANDS_POLICY: OnceLock = OnceLock::new(); static ROUND_PLACES: OnceLock = OnceLock::new(); static WHATLANG_CONFIDENCE_THRESHOLD: OnceLock = OnceLock::new(); +static GENDER_GUESSER: OnceLock = OnceLock::new(); // default confidence threshold for whatlang language detection - 90% confidence const DEFAULT_THRESHOLD: f64 = 0.9; @@ -831,6 +832,15 @@ fn validate_operations( "--new_column (-c) is required for sentiment operation." 
); } + if sentiment_invokes == 0 { + if SENTIMENT_ANALYZER + .set(SentimentIntensityAnalyzer::new()) + .is_err() + { + return fail!("Cannot initialize Sentiment Analyzer."); + } + } + sentiment_invokes = sentiment_invokes.saturating_add(1); }, Operations::Simdl @@ -932,6 +942,9 @@ fn validate_operations( "--new_column (-c) is required for Gender_Guess" ); } + if GENDER_GUESSER.set(gender_guesser::Detector::new()).is_err() { + return fail!("Cannot initialize Gender Detector."); + } }, _ => {}, } @@ -970,7 +983,7 @@ fn apply_operations( for op in ops_vec { match op { Operations::Len => { - *cell = cell.len().to_string(); + itoa::Buffer::new().format(cell.len()).clone_into(cell); }, Operations::Lower => { *cell = cell.to_lowercase(); @@ -980,11 +993,11 @@ fn apply_operations( }, Operations::Squeeze => { let squeezer: &'static Regex = regex_oncelock!(r"\s+"); - *cell = squeezer.replace_all(cell, " ").to_string(); + *cell = squeezer.replace_all(cell, " ").into_owned(); }, Operations::Squeeze0 => { let squeezer: &'static Regex = regex_oncelock!(r"\s+"); - *cell = squeezer.replace_all(cell, "").to_string(); + *cell = squeezer.replace_all(cell, "").into_owned(); }, Operations::Trim => { *cell = String::from(cell.trim()); @@ -1038,18 +1051,21 @@ fn apply_operations( // including selection of the best algorithm repeatedly at runtime let mut crc32_hasher = CRC32.get().unwrap().clone(); crc32_hasher.update(cell.as_bytes()); - *cell = crc32_hasher.finalize().to_string(); + itoa::Buffer::new() + .format(crc32_hasher.finalize()) + .clone_into(cell); }, Operations::Gender_Guess => { - let gender_detector = gender_guesser::Detector::new(); + // safety: we set GENDER_GUESSER in validate_operations() + let gender_detector = GENDER_GUESSER.get().unwrap(); *cell = match gender_detector.get_gender(cell) { - Gender::Male => "Male".to_string(), - Gender::Female => "Female".to_string(), - Gender::MayBeMale => "MayBeMale".to_string(), - Gender::MayBeFemale => "MayBeFemale".to_string(), - 
Gender::BothMaleFemale => "BothMaleFemale".to_string(), - Gender::NotSure => "NotSure".to_string(), - Gender::NotFound => "NotFound".to_string(), + Gender::Male => "Male".to_owned(), + Gender::Female => "Female".to_owned(), + Gender::MayBeMale => "MayBeMale".to_owned(), + Gender::MayBeFemale => "MayBeFemale".to_owned(), + Gender::BothMaleFemale => "BothMaleFemale".to_owned(), + Gender::NotSure => "NotSure".to_owned(), + Gender::NotFound => "NotFound".to_owned(), }; }, Operations::Escape => { @@ -1074,7 +1090,7 @@ fn apply_operations( Operations::Regex_Replace => { // safety: we set REGEX_REPLACE in validate_operations() let regexreplace = REGEX_REPLACE.get().unwrap(); - *cell = regexreplace.replace_all(cell, replacement).to_string(); + *cell = regexreplace.replace_all(cell, replacement).into_owned(); }, Operations::Censor => { // safety: we set CENSOR in validate_operations() @@ -1084,12 +1100,14 @@ fn apply_operations( Operations::Censor_Check => { // safety: we set CENSOR in validate_operations() let censor = CENSOR.get().unwrap(); - *cell = censor.check(cell).to_string(); + if censor.check(cell) { "true" } else { "false" }.clone_into(cell); }, Operations::Censor_Count => { // safety: we set CENSOR in validate_operations() let censor = CENSOR.get().unwrap(); - *cell = censor.count(cell).to_string(); + itoa::Buffer::new() + .format(censor.count(cell)) + .clone_into(cell); }, Operations::Thousands => { if let Ok(num) = cell.parse::() { @@ -1170,25 +1188,37 @@ fn apply_operations( } }, Operations::Simdl => { - *cell = damerau_levenshtein(cell, comparand).to_string(); + itoa::Buffer::new() + .format(damerau_levenshtein(cell, comparand)) + .clone_into(cell); }, Operations::Simdln => { - *cell = normalized_damerau_levenshtein(cell, comparand).to_string(); + ryu::Buffer::new() + .format_finite(normalized_damerau_levenshtein(cell, comparand)) + .clone_into(cell); }, Operations::Simjw => { - *cell = jaro_winkler(cell, comparand).to_string(); + ryu::Buffer::new() + 
.format_finite(jaro_winkler(cell, comparand)) + .clone_into(cell); }, Operations::Simsd => { - *cell = sorensen_dice(cell, comparand).to_string(); + ryu::Buffer::new() + .format_finite(sorensen_dice(cell, comparand)) + .clone_into(cell); }, Operations::Simhm => { let ham_val = hamming(cell, comparand); match ham_val { - Ok(val) => *cell = val.to_string(), + Ok(val) => itoa::Buffer::new().format(val).clone_into(cell), Err(_) => *cell = String::from("ERROR: Different lengths"), } }, - Operations::Simod => *cell = osa_distance(cell, comparand).to_string(), + Operations::Simod => { + itoa::Buffer::new() + .format(osa_distance(cell, comparand)) + .clone_into(cell); + }, Operations::Eudex => { // safety: we set EUDEX_COMPARAND_HASH in validate_operations() let eudex_comparand_hash = EUDEX_COMPARAND_HASH.get().unwrap(); @@ -1197,10 +1227,11 @@ fn apply_operations( }, Operations::Sentiment => { // safety: we set SENTIMENT_ANALYZER in validate_operations() - let sentiment_analyzer = - SENTIMENT_ANALYZER.get_or_init(SentimentIntensityAnalyzer::new); + let sentiment_analyzer = SENTIMENT_ANALYZER.get().unwrap(); let sentiment_scores = sentiment_analyzer.polarity_scores(cell); - *cell = sentiment_scores.get("compound").unwrap_or(&0.0).to_string(); + ryu::Buffer::new() + .format_finite(*sentiment_scores.get("compound").unwrap_or(&0.0)) + .clone_into(cell); }, Operations::Whatlang => { let lang_info = detect(cell); From ffa81b1d0217cb149767d2aa5445b33e19aa6114 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sat, 19 Oct 2024 10:27:28 -0400 Subject: [PATCH 079/119] `tests`: `apply` - ryu formats 0_f64 to "0.0" --- tests/test_apply.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_apply.rs b/tests/test_apply.rs index 00f8613ca..7df293d11 100644 --- a/tests/test_apply.rs +++ b/tests/test_apply.rs @@ -2188,8 +2188,8 @@ fn apply_ops_similarity() { svec!["name", "name_sim_score"], svec!["John", "0.5"], 
svec!["Jonathan", "0.25"], - svec!["Edna", "0"], - svec!["Larry", "0"], + svec!["Edna", "0.0"], + svec!["Larry", "0.0"], ]; assert_eq!(got, expected); } From d5b843ff82b44b79820016b38d3c99ca76fcbdbf Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sat, 19 Oct 2024 10:28:04 -0400 Subject: [PATCH 080/119] `applydp`: use to_owned and into_owned instead of to_string to minimize allocs --- src/cmd/applydp.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/cmd/applydp.rs b/src/cmd/applydp.rs index fa42f80d6..7922846c8 100644 --- a/src/cmd/applydp.rs +++ b/src/cmd/applydp.rs @@ -596,7 +596,7 @@ fn applydp_operations( for op in ops_vec { match op { Operations::Len => { - *cell = cell.len().to_string(); + *cell = itoa::Buffer::new().format(cell.len()).to_owned(); }, Operations::Lower => { *cell = cell.to_lowercase(); @@ -606,11 +606,11 @@ fn applydp_operations( }, Operations::Squeeze => { let squeezer: &'static Regex = regex_oncelock!(r"\s+"); - *cell = squeezer.replace_all(cell, " ").to_string(); + *cell = squeezer.replace_all(cell, " ").into_owned(); }, Operations::Squeeze0 => { let squeezer: &'static Regex = regex_oncelock!(r"\s+"); - *cell = squeezer.replace_all(cell, "").to_string(); + *cell = squeezer.replace_all(cell, "").into_owned(); }, Operations::Trim => { *cell = String::from(cell.trim()); @@ -650,7 +650,7 @@ fn applydp_operations( Operations::Regex_Replace => { // safety: we set REGEX_REPLACE in validate_operations() let regexreplace = REGEX_REPLACE.get().unwrap(); - *cell = regexreplace.replace_all(cell, replacement).to_string(); + *cell = regexreplace.replace_all(cell, replacement).into_owned(); }, Operations::Round => { if let Ok(num) = cell.parse::() { From b07ad1a7faccb5063976dc1b4e3220d38d5455a6 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sat, 19 Oct 2024 11:32:39 -0400 Subject: [PATCH 081/119] `stats`: use smallvec for 
INFER_DATE_FLAGS --- src/cmd/stats.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/cmd/stats.rs b/src/cmd/stats.rs index 6d2001d95..cdf321e7c 100644 --- a/src/cmd/stats.rs +++ b/src/cmd/stats.rs @@ -256,6 +256,7 @@ use qsv_dateparser::parse_with_preference; use serde::{Deserialize, Serialize}; use simd_json::{prelude::ValueAsScalar, OwnedValue}; use simdutf8::basic::from_utf8; +use smallvec::{smallvec, SmallVec}; use stats::{merge_all, Commute, MinMax, OnlineStats, Unsorted}; use tempfile::NamedTempFile; use threadpool::ThreadPool; @@ -473,7 +474,7 @@ pub static STATSDATA_TYPES_ARRAY: [JsonTypes; MAX_STAT_COLUMNS] = [ JsonTypes::Int, //antimode_occurrences ]; -static INFER_DATE_FLAGS: OnceLock> = OnceLock::new(); +static INFER_DATE_FLAGS: OnceLock> = OnceLock::new(); static RECORD_COUNT: OnceLock = OnceLock::new(); // standard overflow and underflow strings @@ -1161,14 +1162,14 @@ fn init_date_inference( if !infer_dates { // we're not inferring dates, set INFER_DATE_FLAGS to all false INFER_DATE_FLAGS - .set(vec![false; headers.len()]) + .set(smallvec![false; headers.len()]) .map_err(|e| format!("Cannot init empty date inference flags: {e:?}"))?; return Ok(()); } let infer_date_flags = if flag_whitelist.eq_ignore_ascii_case("all") { log::info!("inferring dates for ALL fields"); - vec![true; headers.len()] + smallvec![true; headers.len()] } else { let mut header_str = String::new(); let whitelist_lower = flag_whitelist.to_lowercase(); From 77171b4852918e0f4a08ac246d6ca7e2b4ae768c Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sat, 19 Oct 2024 11:54:19 -0400 Subject: [PATCH 082/119] `deps`: bump polars to latest upstream --- Cargo.lock | 38 +++++++++++++++++++------------------- Cargo.toml | 4 ++-- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5675d456a..b36ded463 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4636,7 +4636,7 @@ 
dependencies = [ [[package]] name = "polars" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=01a4e06#01a4e067f50fd5f8825d4b25402336310e125d8d" +source = "git+https://github.com/pola-rs/polars?rev=bfdd496#bfdd49622537b9777cfeb65313af76f696ceb7d0" dependencies = [ "getrandom", "polars-arrow", @@ -4655,7 +4655,7 @@ dependencies = [ [[package]] name = "polars-arrow" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=01a4e06#01a4e067f50fd5f8825d4b25402336310e125d8d" +source = "git+https://github.com/pola-rs/polars?rev=bfdd496#bfdd49622537b9777cfeb65313af76f696ceb7d0" dependencies = [ "ahash", "atoi", @@ -4702,7 +4702,7 @@ dependencies = [ [[package]] name = "polars-compute" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=01a4e06#01a4e067f50fd5f8825d4b25402336310e125d8d" +source = "git+https://github.com/pola-rs/polars?rev=bfdd496#bfdd49622537b9777cfeb65313af76f696ceb7d0" dependencies = [ "bytemuck", "either", @@ -4717,7 +4717,7 @@ dependencies = [ [[package]] name = "polars-core" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=01a4e06#01a4e067f50fd5f8825d4b25402336310e125d8d" +source = "git+https://github.com/pola-rs/polars?rev=bfdd496#bfdd49622537b9777cfeb65313af76f696ceb7d0" dependencies = [ "ahash", "bitflags 2.6.0", @@ -4751,7 +4751,7 @@ dependencies = [ [[package]] name = "polars-error" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=01a4e06#01a4e067f50fd5f8825d4b25402336310e125d8d" +source = "git+https://github.com/pola-rs/polars?rev=bfdd496#bfdd49622537b9777cfeb65313af76f696ceb7d0" dependencies = [ "avro-schema", "object_store", @@ -4764,7 +4764,7 @@ dependencies = [ [[package]] name = "polars-expr" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=01a4e06#01a4e067f50fd5f8825d4b25402336310e125d8d" +source = "git+https://github.com/pola-rs/polars?rev=bfdd496#bfdd49622537b9777cfeb65313af76f696ceb7d0" dependencies = [ "ahash", 
"bitflags 2.6.0", @@ -4784,7 +4784,7 @@ dependencies = [ [[package]] name = "polars-io" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=01a4e06#01a4e067f50fd5f8825d4b25402336310e125d8d" +source = "git+https://github.com/pola-rs/polars?rev=bfdd496#bfdd49622537b9777cfeb65313af76f696ceb7d0" dependencies = [ "ahash", "async-trait", @@ -4830,7 +4830,7 @@ dependencies = [ [[package]] name = "polars-json" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=01a4e06#01a4e067f50fd5f8825d4b25402336310e125d8d" +source = "git+https://github.com/pola-rs/polars?rev=bfdd496#bfdd49622537b9777cfeb65313af76f696ceb7d0" dependencies = [ "ahash", "chrono", @@ -4851,7 +4851,7 @@ dependencies = [ [[package]] name = "polars-lazy" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=01a4e06#01a4e067f50fd5f8825d4b25402336310e125d8d" +source = "git+https://github.com/pola-rs/polars?rev=bfdd496#bfdd49622537b9777cfeb65313af76f696ceb7d0" dependencies = [ "ahash", "bitflags 2.6.0", @@ -4877,7 +4877,7 @@ dependencies = [ [[package]] name = "polars-mem-engine" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=01a4e06#01a4e067f50fd5f8825d4b25402336310e125d8d" +source = "git+https://github.com/pola-rs/polars?rev=bfdd496#bfdd49622537b9777cfeb65313af76f696ceb7d0" dependencies = [ "futures", "memmap2", @@ -4898,7 +4898,7 @@ dependencies = [ [[package]] name = "polars-ops" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=01a4e06#01a4e067f50fd5f8825d4b25402336310e125d8d" +source = "git+https://github.com/pola-rs/polars?rev=bfdd496#bfdd49622537b9777cfeb65313af76f696ceb7d0" dependencies = [ "ahash", "argminmax", @@ -4931,7 +4931,7 @@ dependencies = [ [[package]] name = "polars-parquet" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=01a4e06#01a4e067f50fd5f8825d4b25402336310e125d8d" +source = 
"git+https://github.com/pola-rs/polars?rev=bfdd496#bfdd49622537b9777cfeb65313af76f696ceb7d0" dependencies = [ "ahash", "async-stream", @@ -4968,7 +4968,7 @@ dependencies = [ [[package]] name = "polars-pipe" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=01a4e06#01a4e067f50fd5f8825d4b25402336310e125d8d" +source = "git+https://github.com/pola-rs/polars?rev=bfdd496#bfdd49622537b9777cfeb65313af76f696ceb7d0" dependencies = [ "crossbeam-channel", "crossbeam-queue", @@ -4994,7 +4994,7 @@ dependencies = [ [[package]] name = "polars-plan" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=01a4e06#01a4e067f50fd5f8825d4b25402336310e125d8d" +source = "git+https://github.com/pola-rs/polars?rev=bfdd496#bfdd49622537b9777cfeb65313af76f696ceb7d0" dependencies = [ "ahash", "bitflags 2.6.0", @@ -5028,7 +5028,7 @@ dependencies = [ [[package]] name = "polars-row" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=01a4e06#01a4e067f50fd5f8825d4b25402336310e125d8d" +source = "git+https://github.com/pola-rs/polars?rev=bfdd496#bfdd49622537b9777cfeb65313af76f696ceb7d0" dependencies = [ "bytemuck", "polars-arrow", @@ -5039,7 +5039,7 @@ dependencies = [ [[package]] name = "polars-schema" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=01a4e06#01a4e067f50fd5f8825d4b25402336310e125d8d" +source = "git+https://github.com/pola-rs/polars?rev=bfdd496#bfdd49622537b9777cfeb65313af76f696ceb7d0" dependencies = [ "indexmap", "polars-error", @@ -5051,7 +5051,7 @@ dependencies = [ [[package]] name = "polars-sql" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=01a4e06#01a4e067f50fd5f8825d4b25402336310e125d8d" +source = "git+https://github.com/pola-rs/polars?rev=bfdd496#bfdd49622537b9777cfeb65313af76f696ceb7d0" dependencies = [ "hex", "once_cell", @@ -5072,7 +5072,7 @@ dependencies = [ [[package]] name = "polars-time" version = "0.43.1" -source = 
"git+https://github.com/pola-rs/polars?rev=01a4e06#01a4e067f50fd5f8825d4b25402336310e125d8d" +source = "git+https://github.com/pola-rs/polars?rev=bfdd496#bfdd49622537b9777cfeb65313af76f696ceb7d0" dependencies = [ "atoi", "bytemuck", @@ -5092,7 +5092,7 @@ dependencies = [ [[package]] name = "polars-utils" version = "0.43.1" -source = "git+https://github.com/pola-rs/polars?rev=01a4e06#01a4e067f50fd5f8825d4b25402336310e125d8d" +source = "git+https://github.com/pola-rs/polars?rev=bfdd496#bfdd49622537b9777cfeb65313af76f696ceb7d0" dependencies = [ "ahash", "bytemuck", diff --git a/Cargo.toml b/Cargo.toml index 352241129..637a6b301 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -315,9 +315,9 @@ strum_macros = { git = "https://github.com/jqnatividad/strum", branch = "bump-ph # BUILD NOTE: Be sure to set QSV_POLARS_REV below to the latest commit short hash or tag # of polars/py-polars before building qsv. This allows us to show the polars rev/tag in --version. # if we are using a release version of Rust Polars, leave QSV_POLARS_REV empty -# QSV_POLARS_REV=01a4e06 +# QSV_POLARS_REV=bfdd496 # polars = { git = "https://github.com/pola-rs/polars", tag = "py-1.9.0" } -polars = { git = "https://github.com/pola-rs/polars", rev = "01a4e06" } +polars = { git = "https://github.com/pola-rs/polars", rev = "bfdd496" } [features] default = ["mimalloc"] From 98064e18c075a8b6276907c0ed73aa4456c72a6b Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sat, 19 Oct 2024 12:07:11 -0400 Subject: [PATCH 083/119] `excel`: use to_owned instead of to_string --- src/cmd/excel.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cmd/excel.rs b/src/cmd/excel.rs index 58788b4ad..11b584c2a 100644 --- a/src/cmd/excel.rs +++ b/src/cmd/excel.rs @@ -1043,9 +1043,9 @@ pub fn run(argv: &[&str]) -> CliResult<()> { if write!(formatted_date, "{}", dt.format(&date_format)) .is_ok() { - // the format string was ok, so use to_string() + // the 
format string was ok, so use to_owned() // to actually apply the DelayedFormat - work_date = formatted_date.to_string(); + work_date = formatted_date.to_owned(); } else { // if there was a format error, revert to the // default format From 59a830102a2e9b66e62ed1ec13381daaa6eb88c2 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 19 Oct 2024 17:56:00 +0000 Subject: [PATCH 084/119] build(deps): bump serde_json from 1.0.131 to 1.0.132 Bumps [serde_json](https://github.com/serde-rs/json) from 1.0.131 to 1.0.132. - [Release notes](https://github.com/serde-rs/json/releases) - [Commits](https://github.com/serde-rs/json/compare/1.0.131...1.0.132) --- updated-dependencies: - dependency-name: serde_json dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- Cargo.lock | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b36ded463..60836bd3a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6301,9 +6301,9 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.131" +version = "1.0.132" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67d42a0bd4ac281beff598909bb56a86acaf979b84483e1c79c10dcaf98f8cf3" +checksum = "d726bfaff4b320266d395898905d0eba0345aae23b54aee3a737e260fd46db03" dependencies = [ "indexmap", "itoa", From 18001e6b4cc190923282102e60db18f23a756ae5 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sat, 19 Oct 2024 16:34:48 -0400 Subject: [PATCH 085/119] `deps`: bump syn from 2.0.79 to 2.0.80 --- Cargo.lock | 102 ++++++++++++++++++++++++++--------------------------- 1 file changed, 51 insertions(+), 51 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 60836bd3a..94ccc4b03 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -718,7 +718,7 @@ checksum = "3b43422f69d8ff38f95f1b2bb76517c91589a924d1559a0e935d7c8ce0274c11" 
dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.80", ] [[package]] @@ -758,7 +758,7 @@ checksum = "c7c24de15d275a1ecfd47a380fb4d5ec9bfe0933f309ed5e705b775596a3574d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.80", ] [[package]] @@ -775,7 +775,7 @@ checksum = "721cae7de5c34fbb2acd27e21e6d2cf7b886dce0c27388d46c4e6c47ea4318dd" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.80", ] [[package]] @@ -1024,7 +1024,7 @@ checksum = "bcfcc3cd946cb52f0bbfdbbcfa2f4e24f75ebb6c0e1002f7c25904fada18b9ec" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.80", ] [[package]] @@ -1106,7 +1106,7 @@ dependencies = [ "darling", "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.80", ] [[package]] @@ -1312,7 +1312,7 @@ dependencies = [ "heck 0.5.0", "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.80", ] [[package]] @@ -1823,7 +1823,7 @@ checksum = "f46882e17999c6cc590af592290432be3bce0428cb0d5f8b6715e4dc7b383eb3" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.80", ] [[package]] @@ -1847,7 +1847,7 @@ dependencies = [ "proc-macro2", "quote", "strsim", - "syn 2.0.79", + "syn 2.0.80", ] [[package]] @@ -1858,7 +1858,7 @@ checksum = "d336a2a514f6ccccaa3e09b02d41d35330c07ddf03a62165fcec10bb561c7806" dependencies = [ "darling_core", "quote", - "syn 2.0.79", + "syn 2.0.80", ] [[package]] @@ -1947,7 +1947,7 @@ checksum = "d150dea618e920167e5973d70ae6ece4385b7164e0d799fe7c122dd0a5d912ad" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.80", ] [[package]] @@ -1958,7 +1958,7 @@ checksum = "67e77553c4162a157adbf834ebae5b415acbecbeafc7a74b0e886657506a7611" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.80", ] [[package]] @@ -1971,7 +1971,7 @@ dependencies = [ "proc-macro2", "quote", "rustc_version", - "syn 2.0.79", + "syn 2.0.80", ] [[package]] @@ -2020,7 +2020,7 @@ checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" dependencies = [ 
"proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.80", ] [[package]] @@ -2153,7 +2153,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.80", ] [[package]] @@ -2174,7 +2174,7 @@ checksum = "de0d48a183585823424a4ce1aa132d174a6a81bd540895822eb4c8373a8e49e8" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.80", ] [[package]] @@ -2457,7 +2457,7 @@ checksum = "1a5c6c585bc94aaf2c7b51dd4c2ba22680844aba4c687be581871a6f518c5742" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.80", ] [[package]] @@ -2584,7 +2584,7 @@ checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.80", ] [[package]] @@ -3166,7 +3166,7 @@ checksum = "1ec89e9337638ecdc08744df490b221a7399bf8d164eb52a665454e60e075ad6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.80", ] [[package]] @@ -3269,7 +3269,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b23a0c8dfe501baac4adf6ebbfa6eddf8f0c07f56b058cc1288017e32397846c" dependencies = [ "quote", - "syn 2.0.79", + "syn 2.0.80", ] [[package]] @@ -4540,7 +4540,7 @@ dependencies = [ "phf_shared", "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.80", ] [[package]] @@ -4569,7 +4569,7 @@ checksum = "a4502d8515ca9f32f1fb543d987f63d95a14934883db45bdb48060b6b69257f8" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.80", ] [[package]] @@ -5296,7 +5296,7 @@ dependencies = [ "proc-macro2", "pyo3-macros-backend", "quote", - "syn 2.0.79", + "syn 2.0.80", ] [[package]] @@ -5309,7 +5309,7 @@ dependencies = [ "proc-macro2", "pyo3-build-config", "quote", - "syn 2.0.79", + "syn 2.0.80", ] [[package]] @@ -5763,7 +5763,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "76009fbe0614077fc1a2ce255e3a1881a2e3a3527097d5dc6d8212c585e7e38b" dependencies = [ "quote", - "syn 2.0.79", + "syn 2.0.80", ] [[package]] @@ -5845,7 
+5845,7 @@ checksum = "bcc303e793d3734489387d205e9b186fac9c6cfacedd98cbb2e8a5943595f3e6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.80", ] [[package]] @@ -6296,7 +6296,7 @@ checksum = "243902eda00fad750862fc144cea25caca5e20d615af0a81bee94ca738f1df1f" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.80", ] [[package]] @@ -6320,7 +6320,7 @@ checksum = "6c64451ba24fc7a6a2d60fc75dd9c83c90903b19028d4eff35e88fc1e86564e9" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.80", ] [[package]] @@ -6368,7 +6368,7 @@ checksum = "82fe9db325bcef1fbcde82e078a5cc4efdf787e96b3b9cf45b50b529f2083d67" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.80", ] [[package]] @@ -6579,7 +6579,7 @@ dependencies = [ "heck 0.5.0", "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.80", ] [[package]] @@ -6731,7 +6731,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.79", + "syn 2.0.80", ] [[package]] @@ -6753,9 +6753,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.79" +version = "2.0.80" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89132cd0bf050864e1d38dc3bbc07a0eb8e7530af26344d3d2bbbef83499f590" +checksum = "e6e185e337f816bc8da115b8afcb3324006ccc82eeaddf35113888d3bd8e44ac" dependencies = [ "proc-macro2", "quote", @@ -6779,7 +6779,7 @@ checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.80", ] [[package]] @@ -6860,7 +6860,7 @@ checksum = "08904e7672f5eb876eaaf87e0ce17857500934f4981c4a0ab2b4aa98baac7fc3" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.80", ] [[package]] @@ -6999,7 +6999,7 @@ checksum = "693d596312e88961bc67d7f1f97af8a70227d9f90c31bba5806eec004978d752" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.80", ] [[package]] @@ -7095,7 +7095,7 @@ checksum = 
"34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.80", ] [[package]] @@ -7180,7 +7180,7 @@ checksum = "560b82d656506509d43abe30e0ba64c56b1953ab3d4fe7ba5902747a7a3cedd5" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.80", ] [[package]] @@ -7476,7 +7476,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.80", "wasm-bindgen-shared", ] @@ -7510,7 +7510,7 @@ checksum = "26c6ab57572f7a24a4985830b120de1594465e5d500f24afe89e16b4e833ef68" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.80", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -7788,7 +7788,7 @@ checksum = "9107ddc059d5b6fbfbffdfa7a7fe3e22a226def0b2608f72e9d552763d3e1ad7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.80", ] [[package]] @@ -7799,7 +7799,7 @@ checksum = "2bbd5b46c938e506ecbce286b6628a02171d56153ba733b6c741fc627ec9579b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.80", ] [[package]] @@ -7810,7 +7810,7 @@ checksum = "29bee4b38ea3cde66011baa44dba677c432a78593e202392d1e9070cf2a7fca7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.80", ] [[package]] @@ -7821,7 +7821,7 @@ checksum = "053c4c462dc91d3b1504c6fe5a726dd15e216ba718e84a0e46a88fbe5ded3515" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.80", ] [[package]] @@ -8105,7 +8105,7 @@ checksum = "28cc31741b18cb6f1d5ff12f5b7523e3d6eb0852bbbad19d73905511d9849b95" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.80", "synstructure", ] @@ -8156,7 +8156,7 @@ dependencies = [ "proc-macro-crate", "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.80", "zvariant_utils", ] @@ -8189,7 +8189,7 @@ checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.80", ] [[package]] @@ -8209,7 +8209,7 @@ checksum = 
"0ea7b4a3637ea8669cedf0f1fd5c286a17f3de97b8dd5a70a6c167a1730e63a5" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.80", "synstructure", ] @@ -8230,7 +8230,7 @@ checksum = "ce36e65b0d2999d2aafac989fb249189a141aee1f53c612c1f37d72631959f69" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.80", ] [[package]] @@ -8252,7 +8252,7 @@ checksum = "6eafa6dfb17584ea3e2bd6e76e0cc15ad7af12b09abdd1ca55961bed9b1063c6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.80", ] [[package]] @@ -8359,7 +8359,7 @@ dependencies = [ "proc-macro-crate", "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.80", "zvariant_utils", ] @@ -8371,5 +8371,5 @@ checksum = "c51bcff7cc3dbb5055396bcf774748c3dab426b4b8659046963523cee4808340" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.80", ] From a812100d9f74b7777f80994bf3cbef14413ed5ab Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sat, 19 Oct 2024 16:37:32 -0400 Subject: [PATCH 086/119] `deps`: update csvlens to latest upstream --- Cargo.lock | 2 +- Cargo.toml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 94ccc4b03..2a48b5544 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1749,7 +1749,7 @@ dependencies = [ [[package]] name = "csvlens" version = "0.10.1" -source = "git+https://github.com/YS-L/csvlens?rev=002edeb#002edebeda69b0b81b4fd4060e25e80c5a87f4da" +source = "git+https://github.com/YS-L/csvlens?rev=b3fab72#b3fab72c4fae6b03072cc0e27d6ff60c322b2011" dependencies = [ "anyhow", "arboard", diff --git a/Cargo.toml b/Cargo.toml index 637a6b301..e418ec337 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -279,8 +279,8 @@ csv = { git = "https://github.com/jqnatividad/rust-csv", branch = "qsv-opt csv-core = { git = "https://github.com/jqnatividad/rust-csv", branch = "qsv-optimized" } csv-index = { git = "https://github.com/jqnatividad/rust-csv", branch = "qsv-optimized" } -# use 
latest csvlens upstream with latest dependencies, including arrow 53, with lexical-core fix -csvlens = { git = "https://github.com/YS-L/csvlens", rev = "002edeb" } +# use latest csvlens upstream with latest dependencies and fixes +csvlens = { git = "https://github.com/YS-L/csvlens", rev = "b3fab72" } # modernized fork of crc32fast, 2021 edition, MSRV 1.81, select clippy lint suggestions applied # crc32fast = { git = "https://github.com/jqnatividad/rust-crc32fast", branch = "modernize" } From 5afb20c2c1a3cf6f6bbc3539f90b8d5f85a49eaf Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sat, 19 Oct 2024 16:46:49 -0400 Subject: [PATCH 087/119] `deps`: bump governor from 0.6.3 to 0.6.4 with our bumped dashmap from 5.1.0 to 6.1.0 --- Cargo.lock | 4 ++-- Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 2a48b5544..b6c6a42c9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2729,8 +2729,8 @@ checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" [[package]] name = "governor" -version = "0.6.3" -source = "git+https://github.com/jqnatividad/governor?branch=deps-bump-dashmap#f93388e4e2109b529d78b42cc66fe12c5d787e8f" +version = "0.6.4" +source = "git+https://github.com/jqnatividad/governor?branch=bump-dashmap-to-6.1.0#7668120b6f45211fcda3e2d4cc61c3f1d673a440" dependencies = [ "cfg-if", "dashmap", diff --git a/Cargo.toml b/Cargo.toml index e418ec337..f3fa98720 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -290,7 +290,7 @@ csvlens = { git = "https://github.com/YS-L/csvlens", rev = "b3fab72" } dynfmt = { git = "https://github.com/jqnatividad/dynfmt", branch = "2021-clippy_ptr_as_ptr-bumpdeps" } # use our fork of governor with bumped dashmap dependency -governor = { git = "https://github.com/jqnatividad/governor", branch = "deps-bump-dashmap" } +governor = { git = "https://github.com/jqnatividad/governor", branch = "bump-dashmap-to-6.1.0" } # needed to get 
latest dependencies and unreleased fixes grex = { git = "https://github.com/pemistahl/grex", rev = "ff8533d" } From 98ce6e77f718c83f9108c98a56b0b2e10175356e Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sat, 19 Oct 2024 17:08:23 -0400 Subject: [PATCH 088/119] clippy::collapsible_if warning: this `if` statement can be collapsed --> src/cmd/apply.rs:835:17 | 835 | / if sentiment_invokes == 0 { 836 | | if SENTIMENT_ANALYZER 837 | | .set(SentimentIntensityAnalyzer::new()) 838 | | .is_err() ... | 841 | | } 842 | | } | |_________________^ | = help: for further information visit https://rust-lang.github.io/rust-clippy/master/index.html#collapsible_if = note: `#[warn(clippy::collapsible_if)]` on by default help: collapse nested if block | 835 ~ if sentiment_invokes == 0 && SENTIMENT_ANALYZER 836 + .set(SentimentIntensityAnalyzer::new()) 837 + .is_err() { 838 + return fail!("Cannot initialize Sentiment Analyzer."); 839 + } --- src/cmd/apply.rs | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/cmd/apply.rs b/src/cmd/apply.rs index dd63820c4..54d13421b 100644 --- a/src/cmd/apply.rs +++ b/src/cmd/apply.rs @@ -832,13 +832,12 @@ fn validate_operations( "--new_column (-c) is required for sentiment operation." 
); } - if sentiment_invokes == 0 { - if SENTIMENT_ANALYZER + if sentiment_invokes == 0 + && SENTIMENT_ANALYZER .set(SentimentIntensityAnalyzer::new()) .is_err() - { - return fail!("Cannot initialize Sentiment Analyzer."); - } + { + return fail!("Cannot initialize Sentiment Analyzer."); } sentiment_invokes = sentiment_invokes.saturating_add(1); From 26cd930ae40443e4fa73d0bc962aed1a07a07431 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sat, 19 Oct 2024 17:17:23 -0400 Subject: [PATCH 089/119] `excel`: revert back to using to_string as DelayedFormat actually requires it to apply the format --- src/cmd/excel.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cmd/excel.rs b/src/cmd/excel.rs index 11b584c2a..58788b4ad 100644 --- a/src/cmd/excel.rs +++ b/src/cmd/excel.rs @@ -1043,9 +1043,9 @@ pub fn run(argv: &[&str]) -> CliResult<()> { if write!(formatted_date, "{}", dt.format(&date_format)) .is_ok() { - // the format string was ok, so use to_owned() + // the format string was ok, so use to_string() // to actually apply the DelayedFormat - work_date = formatted_date.to_owned(); + work_date = formatted_date.to_string(); } else { // if there was a format error, revert to the // default format From f205809549ac275078a95bc2821a583611955ad0 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sat, 19 Oct 2024 17:19:24 -0400 Subject: [PATCH 090/119] `stats`: make STATSDATA_TYPES_ARRAY as const instead of a static so it becomes a compile-time constant --- src/cmd/stats.rs | 8 ++++++-- src/util.rs | 4 ++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/src/cmd/stats.rs b/src/cmd/stats.rs index cdf321e7c..9ff7c2b06 100644 --- a/src/cmd/stats.rs +++ b/src/cmd/stats.rs @@ -434,7 +434,7 @@ pub enum JsonTypes { // we use this to serialize the StatsData data structure // to a JSONL file using serde_json -pub static STATSDATA_TYPES_ARRAY: 
[JsonTypes; MAX_STAT_COLUMNS] = [ +const STATSDATA_TYPES_ARRAY: [JsonTypes; MAX_STAT_COLUMNS] = [ JsonTypes::String, //field JsonTypes::String, //type JsonTypes::Bool, //is_ascii @@ -497,6 +497,10 @@ const MAX_ANTIMODES: usize = 10; // maximum length of antimode string before truncating and appending "..." const MAX_ANTIMODE_LEN: usize = 100; +pub const fn get_stats_data_types() -> [JsonTypes; MAX_STAT_COLUMNS] { + STATSDATA_TYPES_ARRAY +} + pub fn run(argv: &[&str]) -> CliResult<()> { let mut args: Args = util::get_args(USAGE, argv)?; if args.flag_typesonly { @@ -887,7 +891,7 @@ pub fn run(argv: &[&str]) -> CliResult<()> { // save the stats data to ".stats.csv.data.jsonl" if write_stats_jsonl { stats_pathbuf.set_extension("data.jsonl"); - util::csv_to_jsonl(&currstats_filename, &STATSDATA_TYPES_ARRAY, stats_pathbuf)?; + util::csv_to_jsonl(&currstats_filename, &get_stats_data_types(), stats_pathbuf)?; } } } diff --git a/src/util.rs b/src/util.rs index b12b68a51..844317286 100644 --- a/src/util.rs +++ b/src/util.rs @@ -28,7 +28,7 @@ use sysinfo::System; #[cfg(feature = "polars")] use crate::cmd::count::polars_count_input; use crate::{ - cmd::stats::{JsonTypes, StatsData, STATSDATA_TYPES_ARRAY}, + cmd::stats::{get_stats_data_types, JsonTypes, StatsData}, config, config::{Config, Delimiter, DEFAULT_RDR_BUFFER_CAPACITY, DEFAULT_WTR_BUFFER_CAPACITY}, select::SelectColumns, @@ -2116,7 +2116,7 @@ pub fn get_stats_records( // create a statsdatajon from the output of the stats command csv_to_jsonl( &tempfile_path, - &STATSDATA_TYPES_ARRAY, + &get_stats_data_types(), statsdatajson_path.clone(), )?; From e26c27f58df688d7bfb2185ad54d4fe010b1fccf Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sat, 19 Oct 2024 18:01:16 -0400 Subject: [PATCH 091/119] `stats`: use unwrap_unchecked instead of unwrap in perf-critical hot loop we already use unwrap anyway, might as well use unwrap_unchecked for stuff we know will not panic to get
rid of panic code --- src/cmd/stats.rs | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/cmd/stats.rs b/src/cmd/stats.rs index 9ff7c2b06..2b0405b2c 100644 --- a/src/cmd/stats.rs +++ b/src/cmd/stats.rs @@ -951,12 +951,19 @@ impl Args { // if it does return an Err, you have a bigger problem as the index file was // modified WHILE stats is running and you NEED to abort if that // happens, however unlikely - let mut idx = args.rconfig().indexed().unwrap().unwrap(); + let mut idx = unsafe { + args.rconfig() + .indexed() + .unwrap_unchecked() + .unwrap_unchecked() + }; idx.seek((i * chunk_size) as u64) .expect("File seek failed."); let it = idx.byte_records().take(chunk_size); // safety: this will only return an Error if the channel has been disconnected - send.send(args.compute(&sel, it)).unwrap(); + unsafe { + send.send(args.compute(&sel, it)).unwrap_unchecked(); + } }); } drop(send); From d36eb9587b74afc87bbaa412dce2672deafbdeeb Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sat, 19 Oct 2024 18:16:46 -0400 Subject: [PATCH 092/119] `tests`: remove macOS CI test only manually dispatchable Now that Apple Silicon is widely deployed and the last Intel-powered mac is about 5 years ago, disable auto-CI tests on x86 as it's too darn slow [skip ci] --- .github/workflows/rust-macos-x86.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/rust-macos-x86.yml b/.github/workflows/rust-macos-x86.yml index 36b410097..e23a1071e 100644 --- a/.github/workflows/rust-macos-x86.yml +++ b/.github/workflows/rust-macos-x86.yml @@ -1,10 +1,10 @@ name: macOS x86_64-apple-darwin on: - push: - branches: [ master ] - pull_request: - branches: [ master ] + # push: + # branches: [ master ] + # pull_request: + # branches: [ master ] workflow_dispatch: concurrency: From 4579c1bfba4eca21d7480694780e39f6966a88a0 Mon Sep 17 00:00:00 2001 From: Joel Natividad
<1980690+jqnatividad@users.noreply.github.com> Date: Sat, 19 Oct 2024 18:36:15 -0400 Subject: [PATCH 093/119] `stats`: cache njobs result --- src/cmd/stats.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/cmd/stats.rs b/src/cmd/stats.rs index 2b0405b2c..0b17d1dce 100644 --- a/src/cmd/stats.rs +++ b/src/cmd/stats.rs @@ -497,6 +497,7 @@ const MAX_ANTIMODES: usize = 10; // maximum length of antimode string before truncating and appending "..." const MAX_ANTIMODE_LEN: usize = 100; +// we do this so this is evaluated at compile-time pub const fn get_stats_data_types() -> [JsonTypes; MAX_STAT_COLUMNS] { STATSDATA_TYPES_ARRAY } @@ -939,10 +940,11 @@ impl Args { init_date_inference(self.flag_infer_dates, &headers, whitelist)?; - let chunk_size = util::chunk_size(idx_count as usize, util::njobs(self.flag_jobs)); + let njobs = util::njobs(self.flag_jobs); + let chunk_size = util::chunk_size(idx_count as usize, njobs); let nchunks = util::num_of_chunks(idx_count as usize, chunk_size); - let pool = ThreadPool::new(util::njobs(self.flag_jobs)); + let pool = ThreadPool::new(njobs); let (send, recv) = crossbeam_channel::bounded(0); for i in 0..nchunks { let (send, args, sel) = (send.clone(), self.clone(), sel.clone()); From 00e3493bb72836f1a86deabc1564ccff4bab753a Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sat, 19 Oct 2024 18:41:56 -0400 Subject: [PATCH 094/119] utils: njobs helper - remove logging in perf-critical code; use itoa instead of to_string --- src/util.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/util.rs b/src/util.rs index 844317286..25556fc4e 100644 --- a/src/util.rs +++ b/src/util.rs @@ -135,8 +135,8 @@ pub fn njobs(flag_jobs: Option) -> usize { jobs } }); - env::set_var("RAYON_NUM_THREADS", jobs_to_use.to_string()); - log::info!("Using {jobs_to_use} jobs..."); + env::set_var("RAYON_NUM_THREADS", itoa::Buffer::new().format(jobs_to_use)); + // 
log::info!("Using {jobs_to_use} jobs..."); jobs_to_use } From b31a2314f1de98340183e7426819ac539c63b85d Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sat, 19 Oct 2024 19:11:55 -0400 Subject: [PATCH 095/119] adjust to new clippy:regex_creation_in_loops lint that is false positive as we're using oncelock in the macro --- src/util.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/util.rs b/src/util.rs index 25556fc4e..867db8735 100644 --- a/src/util.rs +++ b/src/util.rs @@ -39,6 +39,7 @@ use crate::{ macro_rules! regex_oncelock { ($re:literal $(,)?) => {{ static RE: std::sync::OnceLock = std::sync::OnceLock::new(); + #[allow(clippy::regex_creation_in_loops)] // false positive as we use oncelock RE.get_or_init(|| regex::Regex::new($re).expect("Invalid regex")) }}; } From b4822ff1ee1e95fad1b9022a9ba2dddf27ee487f Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sat, 19 Oct 2024 19:13:49 -0400 Subject: [PATCH 096/119] clippy::manual_ignore_case_cmp warning: manual case-insensitive ASCII comparison --> src/cmd/datefmt.rs:268:16 | 268 | if tz.to_ascii_lowercase() == "local" { | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | = help: for further information visit https://rust-lang.github.io/rust-clippy/master/index.html#manual_ignore_case_cmp = note: `#[warn(clippy::manual_ignore_case_cmp)]` on by default help: consider using `.eq_ignore_ascii_case()` instead | 268 | if tz.eq_ignore_ascii_case("local") { | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ warning: manual case-insensitive ASCII comparison --> src/cmd/datefmt.rs:285:15 | 285 | } else if args.flag_input_tz.to_ascii_lowercase() == "local" { | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | = help: for further information visit https://rust-lang.github.io/rust-clippy/master/index.html#manual_ignore_case_cmp help: consider using `.eq_ignore_ascii_case()` instead | 285 | } else if args.flag_input_tz.eq_ignore_ascii_case("local") { 
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ warning: manual case-insensitive ASCII comparison --> src/cmd/datefmt.rs:298:15 | 298 | } else if args.flag_output_tz.to_ascii_lowercase() == "local" { | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | = help: for further information visit https://rust-lang.github.io/rust-clippy/master/index.html#manual_ignore_case_cmp help: consider using `.eq_ignore_ascii_case()` instead | 298 | } else if args.flag_output_tz.eq_ignore_ascii_case("local") { | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ warning: manual case-insensitive ASCII comparison --> src/cmd/foreach.rs:138:16 | 138 | str if str.to_ascii_lowercase() == "true" => true, | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | = help: for further information visit https://rust-lang.github.io/rust-clippy/master/index.html#manual_ignore_case_cmp help: consider using `.eq_ignore_ascii_case()` instead | 138 | str if str.eq_ignore_ascii_case("true") => true, | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ warning: manual case-insensitive ASCII comparison --> src/cmd/foreach.rs:139:16 | 139 | str if str.to_ascii_lowercase() == "false" => false, | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | = help: for further information visit https://rust-lang.github.io/rust-clippy/master/index.html#manual_ignore_case_cmp help: consider using `.eq_ignore_ascii_case()` instead | 139 | str if str.eq_ignore_ascii_case("false") => false, | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ warning: manual case-insensitive ASCII comparison --> src/cmd/prompt.rs:146:12 | 146 | if args.flag_filters.to_ascii_lowercase() != "none" { | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | = help: for further information visit https://rust-lang.github.io/rust-clippy/master/index.html#manual_ignore_case_cmp help: consider using `.eq_ignore_ascii_case()` instead | 146 | if !args.flag_filters.eq_ignore_ascii_case("none") { --- src/cmd/datefmt.rs | 6 +++--- src/cmd/foreach.rs | 4 ++-- src/cmd/prompt.rs | 2 +- 3 files changed, 6 insertions(+), 
6 deletions(-) diff --git a/src/cmd/datefmt.rs b/src/cmd/datefmt.rs index e2b8a5410..d31280764 100644 --- a/src/cmd/datefmt.rs +++ b/src/cmd/datefmt.rs @@ -265,7 +265,7 @@ pub fn run(argv: &[&str]) -> CliResult<()> { // set timezone variables let default_tz = match args.flag_default_tz.as_deref() { Some(tz) => { - if tz.to_ascii_lowercase() == "local" { + if tz.eq_ignore_ascii_case("local") { if let Some(tz) = localzone::get_local_zone() { log::info!("default-tz local timezone: {tz}"); tz.parse::()? @@ -282,7 +282,7 @@ pub fn run(argv: &[&str]) -> CliResult<()> { let mut input_tz = if let Ok(tz) = args.flag_input_tz.parse::() { tz - } else if args.flag_input_tz.to_ascii_lowercase() == "local" { + } else if args.flag_input_tz.eq_ignore_ascii_case("local") { if let Some(tz) = localzone::get_local_zone() { log::info!("input-tz local timezone: {tz}"); tz.parse::()? @@ -295,7 +295,7 @@ pub fn run(argv: &[&str]) -> CliResult<()> { #[allow(clippy::useless_let_if_seq)] // more readable this way let mut output_tz = if let Ok(tz) = args.flag_output_tz.parse::() { tz - } else if args.flag_output_tz.to_ascii_lowercase() == "local" { + } else if args.flag_output_tz.eq_ignore_ascii_case("local") { if let Some(tz) = localzone::get_local_zone() { log::info!("output-tz local timezone: {tz}"); tz.parse::()? 
diff --git a/src/cmd/foreach.rs b/src/cmd/foreach.rs index 69c8cad51..2639aa98d 100644 --- a/src/cmd/foreach.rs +++ b/src/cmd/foreach.rs @@ -135,8 +135,8 @@ pub fn run(argv: &[&str]) -> CliResult<()> { let mut dry_run_fname = String::new(); let dry_run = match args.flag_dry_run.as_str() { - str if str.to_ascii_lowercase() == "true" => true, - str if str.to_ascii_lowercase() == "false" => false, + str if str.eq_ignore_ascii_case("true") => true, + str if str.eq_ignore_ascii_case("false") => false, file_str => { // if the value is not "true" or "false" case-insensitive, it's a file name // check if we can create the file diff --git a/src/cmd/prompt.rs b/src/cmd/prompt.rs index 588476c0a..510084673 100644 --- a/src/cmd/prompt.rs +++ b/src/cmd/prompt.rs @@ -143,7 +143,7 @@ pub fn run(argv: &[&str]) -> CliResult<()> { .set_directory(args.flag_workdir.clone()) .set_title(title.clone()); - if args.flag_filters.to_ascii_lowercase() != "none" { + if !args.flag_filters.eq_ignore_ascii_case("none") { let ext_comma_delimited: Vec<&str> = args.flag_filters.split(',').collect(); let ext_slice: &[&str] = &ext_comma_delimited; if !ext_slice.is_empty() { From b4ac77c4e12ea65b6115c639774e1d4d6aaaf453 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sat, 19 Oct 2024 19:18:28 -0400 Subject: [PATCH 097/119] sync cargo description to github description --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index f3fa98720..278178619 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,7 +2,7 @@ name = "qsv" version = "0.136.0" #:version authors = ["Joel Natividad "] -description = "A high performance CSV data-wrangling toolkit." +description = "A Blazing-Fast Data-wrangling toolkit." 
documentation = "https://github.com/jqnatividad/qsv#qsv-ultra-fast-csv-data-wrangling-toolkit" homepage = "https://qsv.dathere.com" repository = "https://github.com/jqnatividad/qsv" From fb1af0f0766f92098a0522fd8f7d9dae3315561f Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sat, 19 Oct 2024 19:21:58 -0400 Subject: [PATCH 098/119] `deps`: use latest unreleased governor upstream with bumped dashmap dependency --- Cargo.lock | 2 +- Cargo.toml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b6c6a42c9..36bc68e8e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2730,7 +2730,7 @@ checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" [[package]] name = "governor" version = "0.6.4" -source = "git+https://github.com/jqnatividad/governor?branch=bump-dashmap-to-6.1.0#7668120b6f45211fcda3e2d4cc61c3f1d673a440" +source = "git+https://github.com/boinkor-net/governor?rev=ae92838#ae9283804efda307729ae85e829d3638d8f99b2d" dependencies = [ "cfg-if", "dashmap", diff --git a/Cargo.toml b/Cargo.toml index 278178619..ad5feae5b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -289,8 +289,8 @@ csvlens = { git = "https://github.com/YS-L/csvlens", rev = "b3fab72" } # see https://github.com/jan-auer/dynfmt/pull/9 dynfmt = { git = "https://github.com/jqnatividad/dynfmt", branch = "2021-clippy_ptr_as_ptr-bumpdeps" } -# use our fork of governor with bumped dashmap dependency -governor = { git = "https://github.com/jqnatividad/governor", branch = "bump-dashmap-to-6.1.0" } +# use latest upstream of governor with bumped dashmap dependency +governor = { git = "https://github.com/boinkor-net/governor", rev = "ae92838" } # needed to get latest dependencies and unreleased fixes grex = { git = "https://github.com/pemistahl/grex", rev = "ff8533d" } From a59bce2d1224d96be4a46d2df93cacbd296546b1 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> 
Date: Sun, 20 Oct 2024 17:35:46 -0400 Subject: [PATCH 099/119] `tests`: replace jql with jaq files --- resources/test/fetch_jaq_multiple.jaq | 1 + resources/test/fetch_jaq_single.jaq | 1 + resources/test/fetch_jql_multiple.jql | 1 - resources/test/fetch_jql_single.jql | 1 - 4 files changed, 2 insertions(+), 2 deletions(-) create mode 100644 resources/test/fetch_jaq_multiple.jaq create mode 100644 resources/test/fetch_jaq_single.jaq delete mode 100644 resources/test/fetch_jql_multiple.jql delete mode 100644 resources/test/fetch_jql_single.jql diff --git a/resources/test/fetch_jaq_multiple.jaq b/resources/test/fetch_jaq_multiple.jaq new file mode 100644 index 000000000..0fab93a4e --- /dev/null +++ b/resources/test/fetch_jaq_multiple.jaq @@ -0,0 +1 @@ +[ ."places"[0]."place name" ,."places"[0]."state abbreviation" ] diff --git a/resources/test/fetch_jaq_single.jaq b/resources/test/fetch_jaq_single.jaq new file mode 100644 index 000000000..f472fc7b0 --- /dev/null +++ b/resources/test/fetch_jaq_single.jaq @@ -0,0 +1 @@ +."places"[0]."place name" diff --git a/resources/test/fetch_jql_multiple.jql b/resources/test/fetch_jql_multiple.jql deleted file mode 100644 index 87a680c92..000000000 --- a/resources/test/fetch_jql_multiple.jql +++ /dev/null @@ -1 +0,0 @@ -"places"[0]"place name","places"[0]"state abbreviation" diff --git a/resources/test/fetch_jql_single.jql b/resources/test/fetch_jql_single.jql deleted file mode 100644 index 6f96c9703..000000000 --- a/resources/test/fetch_jql_single.jql +++ /dev/null @@ -1 +0,0 @@ -"places"[0]"place name" From 4f235ce83c67d9f28c0aaa67cf1facf51ae882d2 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sun, 20 Oct 2024 17:41:22 -0400 Subject: [PATCH 100/119] `deps`: remove jql-runner dependency. 
We now also use jaq for `fetch` & `fetchpost` which was introduced with the `json` command --- Cargo.lock | 127 +++++++++++++++++++++-------------------------------- Cargo.toml | 2 - 2 files changed, 51 insertions(+), 78 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 36bc68e8e..025629507 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -718,7 +718,7 @@ checksum = "3b43422f69d8ff38f95f1b2bb76517c91589a924d1559a0e935d7c8ce0274c11" dependencies = [ "proc-macro2", "quote", - "syn 2.0.80", + "syn 2.0.81", ] [[package]] @@ -758,7 +758,7 @@ checksum = "c7c24de15d275a1ecfd47a380fb4d5ec9bfe0933f309ed5e705b775596a3574d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.80", + "syn 2.0.81", ] [[package]] @@ -775,7 +775,7 @@ checksum = "721cae7de5c34fbb2acd27e21e6d2cf7b886dce0c27388d46c4e6c47ea4318dd" dependencies = [ "proc-macro2", "quote", - "syn 2.0.80", + "syn 2.0.81", ] [[package]] @@ -1024,7 +1024,7 @@ checksum = "bcfcc3cd946cb52f0bbfdbbcfa2f4e24f75ebb6c0e1002f7c25904fada18b9ec" dependencies = [ "proc-macro2", "quote", - "syn 2.0.80", + "syn 2.0.81", ] [[package]] @@ -1106,7 +1106,7 @@ dependencies = [ "darling", "proc-macro2", "quote", - "syn 2.0.80", + "syn 2.0.81", ] [[package]] @@ -1312,7 +1312,7 @@ dependencies = [ "heck 0.5.0", "proc-macro2", "quote", - "syn 2.0.80", + "syn 2.0.81", ] [[package]] @@ -1823,7 +1823,7 @@ checksum = "f46882e17999c6cc590af592290432be3bce0428cb0d5f8b6715e4dc7b383eb3" dependencies = [ "proc-macro2", "quote", - "syn 2.0.80", + "syn 2.0.81", ] [[package]] @@ -1847,7 +1847,7 @@ dependencies = [ "proc-macro2", "quote", "strsim", - "syn 2.0.80", + "syn 2.0.81", ] [[package]] @@ -1858,7 +1858,7 @@ checksum = "d336a2a514f6ccccaa3e09b02d41d35330c07ddf03a62165fcec10bb561c7806" dependencies = [ "darling_core", "quote", - "syn 2.0.80", + "syn 2.0.81", ] [[package]] @@ -1947,7 +1947,7 @@ checksum = "d150dea618e920167e5973d70ae6ece4385b7164e0d799fe7c122dd0a5d912ad" dependencies = [ "proc-macro2", "quote", - "syn 2.0.80", + "syn 2.0.81", ] 
[[package]] @@ -1958,7 +1958,7 @@ checksum = "67e77553c4162a157adbf834ebae5b415acbecbeafc7a74b0e886657506a7611" dependencies = [ "proc-macro2", "quote", - "syn 2.0.80", + "syn 2.0.81", ] [[package]] @@ -1971,7 +1971,7 @@ dependencies = [ "proc-macro2", "quote", "rustc_version", - "syn 2.0.80", + "syn 2.0.81", ] [[package]] @@ -2020,7 +2020,7 @@ checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.80", + "syn 2.0.81", ] [[package]] @@ -2153,7 +2153,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.80", + "syn 2.0.81", ] [[package]] @@ -2174,7 +2174,7 @@ checksum = "de0d48a183585823424a4ce1aa132d174a6a81bd540895822eb4c8373a8e49e8" dependencies = [ "proc-macro2", "quote", - "syn 2.0.80", + "syn 2.0.81", ] [[package]] @@ -2457,7 +2457,7 @@ checksum = "1a5c6c585bc94aaf2c7b51dd4c2ba22680844aba4c687be581871a6f518c5742" dependencies = [ "proc-macro2", "quote", - "syn 2.0.80", + "syn 2.0.81", ] [[package]] @@ -2584,7 +2584,7 @@ checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" dependencies = [ "proc-macro2", "quote", - "syn 2.0.80", + "syn 2.0.81", ] [[package]] @@ -3166,7 +3166,7 @@ checksum = "1ec89e9337638ecdc08744df490b221a7399bf8d164eb52a665454e60e075ad6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.80", + "syn 2.0.81", ] [[package]] @@ -3224,7 +3224,6 @@ checksum = "707907fe3c25f5424cce2cb7e1cbcafee6bdbe735ca90ef77c29e84591e5b9da" dependencies = [ "equivalent", "hashbrown 0.15.0", - "rayon", "serde", ] @@ -3269,7 +3268,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b23a0c8dfe501baac4adf6ebbfa6eddf8f0c07f56b058cc1288017e32397846c" dependencies = [ "quote", - "syn 2.0.80", + "syn 2.0.81", ] [[package]] @@ -3404,29 +3403,6 @@ version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f5d4a7da358eff58addd2877a45865158f0d78c911d43a5784ceb7bbf52833b0" -[[package]] -name = 
"jql-parser" -version = "7.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc6c5d45258356a8b4ff8265b929cc95880be34fdc34c884e0ab4585d4a3f356" -dependencies = [ - "thiserror", - "winnow", -] - -[[package]] -name = "jql-runner" -version = "7.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26d68343315f3a2668bf340993d0ae2686bed277d2fb74b28a083fc50fd1db44" -dependencies = [ - "indexmap", - "jql-parser", - "rayon", - "serde_json", - "thiserror", -] - [[package]] name = "js-sys" version = "0.3.72" @@ -4540,7 +4516,7 @@ dependencies = [ "phf_shared", "proc-macro2", "quote", - "syn 2.0.80", + "syn 2.0.81", ] [[package]] @@ -4569,7 +4545,7 @@ checksum = "a4502d8515ca9f32f1fb543d987f63d95a14934883db45bdb48060b6b69257f8" dependencies = [ "proc-macro2", "quote", - "syn 2.0.80", + "syn 2.0.81", ] [[package]] @@ -5296,7 +5272,7 @@ dependencies = [ "proc-macro2", "pyo3-macros-backend", "quote", - "syn 2.0.80", + "syn 2.0.81", ] [[package]] @@ -5309,7 +5285,7 @@ dependencies = [ "proc-macro2", "pyo3-build-config", "quote", - "syn 2.0.80", + "syn 2.0.81", ] [[package]] @@ -5366,7 +5342,6 @@ dependencies = [ "jaq-interpret", "jaq-parse", "jemallocator", - "jql-runner", "json-objects-to-csv", "jsonschema", "local-encoding", @@ -5763,7 +5738,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "76009fbe0614077fc1a2ce255e3a1881a2e3a3527097d5dc6d8212c585e7e38b" dependencies = [ "quote", - "syn 2.0.80", + "syn 2.0.81", ] [[package]] @@ -5845,7 +5820,7 @@ checksum = "bcc303e793d3734489387d205e9b186fac9c6cfacedd98cbb2e8a5943595f3e6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.80", + "syn 2.0.81", ] [[package]] @@ -6296,7 +6271,7 @@ checksum = "243902eda00fad750862fc144cea25caca5e20d615af0a81bee94ca738f1df1f" dependencies = [ "proc-macro2", "quote", - "syn 2.0.80", + "syn 2.0.81", ] [[package]] @@ -6320,7 +6295,7 @@ checksum = 
"6c64451ba24fc7a6a2d60fc75dd9c83c90903b19028d4eff35e88fc1e86564e9" dependencies = [ "proc-macro2", "quote", - "syn 2.0.80", + "syn 2.0.81", ] [[package]] @@ -6368,7 +6343,7 @@ checksum = "82fe9db325bcef1fbcde82e078a5cc4efdf787e96b3b9cf45b50b529f2083d67" dependencies = [ "proc-macro2", "quote", - "syn 2.0.80", + "syn 2.0.81", ] [[package]] @@ -6579,7 +6554,7 @@ dependencies = [ "heck 0.5.0", "proc-macro2", "quote", - "syn 2.0.80", + "syn 2.0.81", ] [[package]] @@ -6731,7 +6706,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.80", + "syn 2.0.81", ] [[package]] @@ -6753,9 +6728,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.80" +version = "2.0.81" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6e185e337f816bc8da115b8afcb3324006ccc82eeaddf35113888d3bd8e44ac" +checksum = "198514704ca887dd5a1e408c6c6cdcba43672f9b4062e1b24aa34e74e6d7faae" dependencies = [ "proc-macro2", "quote", @@ -6779,7 +6754,7 @@ checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971" dependencies = [ "proc-macro2", "quote", - "syn 2.0.80", + "syn 2.0.81", ] [[package]] @@ -6860,7 +6835,7 @@ checksum = "08904e7672f5eb876eaaf87e0ce17857500934f4981c4a0ab2b4aa98baac7fc3" dependencies = [ "proc-macro2", "quote", - "syn 2.0.80", + "syn 2.0.81", ] [[package]] @@ -6999,7 +6974,7 @@ checksum = "693d596312e88961bc67d7f1f97af8a70227d9f90c31bba5806eec004978d752" dependencies = [ "proc-macro2", "quote", - "syn 2.0.80", + "syn 2.0.81", ] [[package]] @@ -7095,7 +7070,7 @@ checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.80", + "syn 2.0.81", ] [[package]] @@ -7180,7 +7155,7 @@ checksum = "560b82d656506509d43abe30e0ba64c56b1953ab3d4fe7ba5902747a7a3cedd5" dependencies = [ "proc-macro2", "quote", - "syn 2.0.80", + "syn 2.0.81", ] [[package]] @@ -7476,7 +7451,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.80", + "syn 2.0.81", 
"wasm-bindgen-shared", ] @@ -7510,7 +7485,7 @@ checksum = "26c6ab57572f7a24a4985830b120de1594465e5d500f24afe89e16b4e833ef68" dependencies = [ "proc-macro2", "quote", - "syn 2.0.80", + "syn 2.0.81", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -7788,7 +7763,7 @@ checksum = "9107ddc059d5b6fbfbffdfa7a7fe3e22a226def0b2608f72e9d552763d3e1ad7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.80", + "syn 2.0.81", ] [[package]] @@ -7799,7 +7774,7 @@ checksum = "2bbd5b46c938e506ecbce286b6628a02171d56153ba733b6c741fc627ec9579b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.80", + "syn 2.0.81", ] [[package]] @@ -7810,7 +7785,7 @@ checksum = "29bee4b38ea3cde66011baa44dba677c432a78593e202392d1e9070cf2a7fca7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.80", + "syn 2.0.81", ] [[package]] @@ -7821,7 +7796,7 @@ checksum = "053c4c462dc91d3b1504c6fe5a726dd15e216ba718e84a0e46a88fbe5ded3515" dependencies = [ "proc-macro2", "quote", - "syn 2.0.80", + "syn 2.0.81", ] [[package]] @@ -8105,7 +8080,7 @@ checksum = "28cc31741b18cb6f1d5ff12f5b7523e3d6eb0852bbbad19d73905511d9849b95" dependencies = [ "proc-macro2", "quote", - "syn 2.0.80", + "syn 2.0.81", "synstructure", ] @@ -8156,7 +8131,7 @@ dependencies = [ "proc-macro-crate", "proc-macro2", "quote", - "syn 2.0.80", + "syn 2.0.81", "zvariant_utils", ] @@ -8189,7 +8164,7 @@ checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.80", + "syn 2.0.81", ] [[package]] @@ -8209,7 +8184,7 @@ checksum = "0ea7b4a3637ea8669cedf0f1fd5c286a17f3de97b8dd5a70a6c167a1730e63a5" dependencies = [ "proc-macro2", "quote", - "syn 2.0.80", + "syn 2.0.81", "synstructure", ] @@ -8230,7 +8205,7 @@ checksum = "ce36e65b0d2999d2aafac989fb249189a141aee1f53c612c1f37d72631959f69" dependencies = [ "proc-macro2", "quote", - "syn 2.0.80", + "syn 2.0.81", ] [[package]] @@ -8252,7 +8227,7 @@ checksum = "6eafa6dfb17584ea3e2bd6e76e0cc15ad7af12b09abdd1ca55961bed9b1063c6" dependencies 
= [ "proc-macro2", "quote", - "syn 2.0.80", + "syn 2.0.81", ] [[package]] @@ -8359,7 +8334,7 @@ dependencies = [ "proc-macro-crate", "proc-macro2", "quote", - "syn 2.0.80", + "syn 2.0.81", "zvariant_utils", ] @@ -8371,5 +8346,5 @@ checksum = "c51bcff7cc3dbb5055396bcf774748c3dab426b4b8659046963523cee4808340" dependencies = [ "proc-macro2", "quote", - "syn 2.0.80", + "syn 2.0.81", ] diff --git a/Cargo.toml b/Cargo.toml index ad5feae5b..15a570f2e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -135,7 +135,6 @@ jsonschema = { version = "0.23", features = [ "resolve-file", "resolve-http", ], default-features = false } -jql-runner = { version = "7.2", default-features = false, optional = true } local-encoding = { version = "0.2", optional = true } localzone = { version = "0.3", features = ["auto_validation"] } log = "0.4" @@ -356,7 +355,6 @@ fetch = [ "flate2", "governor", "hashbrown", - "jql-runner", "publicsuffix", "redis", "serde_stacker", From bcd9dd1ad4c6ac676b9b3cfd4c9e04d4df5eef3f Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sun, 20 Oct 2024 17:55:26 -0400 Subject: [PATCH 101/119] `deps`: remove serde-stacker dependency which was required by jql --- Cargo.lock | 11 ----------- Cargo.toml | 2 -- 2 files changed, 13 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 025629507..c22b7d4ac 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5379,7 +5379,6 @@ dependencies = [ "semver", "serde", "serde_json", - "serde_stacker", "serde_urlencoded", "serial_test", "simd-json", @@ -6298,16 +6297,6 @@ dependencies = [ "syn 2.0.81", ] -[[package]] -name = "serde_stacker" -version = "0.1.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "babfccff5773ff80657f0ecf553c7c516bdc2eb16389c0918b36b73e7015276e" -dependencies = [ - "serde", - "stacker", -] - [[package]] name = "serde_urlencoded" version = "0.7.1" diff --git a/Cargo.toml b/Cargo.toml index 15a570f2e..1e988a825 100644 --- a/Cargo.toml +++ 
b/Cargo.toml @@ -220,7 +220,6 @@ self_update = { version = "0.41", features = [ semver = "1" serde = { version = "1", features = ["derive"] } serde_json = { version = "1", features = ["preserve_order"] } -serde_stacker = { version = "0.1", optional = true } serde_urlencoded = { version = "0.7", optional = true } simple-expand-tilde = { version = "0.4.3", optional = true } smallvec = "1" @@ -357,7 +356,6 @@ fetch = [ "hashbrown", "publicsuffix", "redis", - "serde_stacker", "serde_urlencoded", "simple-expand-tilde", ] From af54c38327e8c9259879fb193763a4e1741c93ed Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sun, 20 Oct 2024 17:57:04 -0400 Subject: [PATCH 102/119] `fetch`: now uses jaq instead of jql --- src/cmd/fetch.rs | 351 +++++++++++++++++++++++++++++++---------------- 1 file changed, 229 insertions(+), 122 deletions(-) diff --git a/src/cmd/fetch.rs b/src/cmd/fetch.rs index 5aea8e224..9db4c68e9 100644 --- a/src/cmd/fetch.rs +++ b/src/cmd/fetch.rs @@ -2,7 +2,7 @@ static USAGE: &str = r#" Fetches data from web services for every row using HTTP Get. -Fetch is integrated with `jql` to directly parse out values from an API JSON response. +Fetch is integrated with `jaq` (a jq clone) to directly parse out values from an API JSON response. CACHE OPTIONS: Fetch caches responses to minimize traffic and maximize performance. It has four @@ -75,10 +75,10 @@ Given the data.csv above, fetch the JSON response. Note the output will be a JSONL file - with a minified JSON response per line, not a CSV file. Now, if we want to generate a CSV file with the parsed City and State, we use the -new-column and jql options. (See https://github.com/yamafaktory/jql#%EF%B8%8F-usage -for more info on how to use the jql JSON Query Language) +new-column and jaq options. 
(See https://github.com/01mf02/jaq?tab=readme-ov-file#examples +for more info on how to use the jaq JSON Query Language) -$ qsv fetch URL --new-column CityState --jql '"places"[0]"place name","places"[0]"state abbreviation"' +$ qsv fetch URL --new-column CityState --jaq '[ ."places"[0]."place name",."places"[0]."state abbreviation" ]' data.csv > data_with_CityState.csv data_with_CityState.csv @@ -87,10 +87,10 @@ data_with_CityState.csv https://api.zippopotam.us/us/94105, "[\"San Francisco\",\"CA\"]" https://api.zippopotam.us/us/92802, "[\"Anaheim\",\"CA\"]" -As you can see, entering jql selectors on the command line is error prone and can quickly become cumbersome. -Alternatively, the jql selector can be saved and loaded from a file using the --jqlfile option. +As you can see, entering jaq selectors on the command line is error prone and can quickly become cumbersome. +Alternatively, the jaq selector can be saved and loaded from a file using the --jaqfile option. - $ qsv fetch URL --new-column CityState --jqlfile places.jql data.csv > datatest.csv + $ qsv fetch URL --new-column CityState --jaqfile places.jaq data.csv > datatest.csv EXAMPLES USING THE --URL-TEMPLATE OPTION: @@ -117,10 +117,10 @@ $ qsv fetch --url-template "https://api.geocode.earth/v1/reverse?point.lat={lati Example 2: Geocode addresses in addresses.csv, pass the "street address" and "zip-code" fields -and use jql to parse placename from the JSON response into a new column in addresses_with_placename.csv. +and use jaq to parse placename from the JSON response into a new column in addresses_with_placename.csv. Note how field name non-alphanumeric characters (space and hyphen) in the url-template were replaced with _. 
-$ qsv fetch --jql '"features"[0]"properties","name"' addresses.csv -c placename --url-template +$ qsv fetch --jaq '."features"[0]."properties", ."name"' addresses.csv -c placename --url-template "https://api.geocode.earth/v1/search/structured?address={street_address}&postalcode={zip_code}" > addresses_with_placename.csv @@ -135,7 +135,7 @@ $ qsv fetch URL data.csv --http-header "X-Api-Key:TEST_KEY" -H "X-Api-Secret:ABC For more extensive examples, see https://github.com/jqnatividad/qsv/blob/master/tests/test_fetch.rs. Usage: - qsv fetch [ | --url-template