Skip to content

Commit

Permalink
fixup! Issue #17 initial steps to decouple code table IO from PrefixC…
Browse files Browse the repository at this point in the history
…odec
  • Loading branch information
soxofaan committed Jul 13, 2024
1 parent 4b6267d commit ebbf191
Show file tree
Hide file tree
Showing 10 changed files with 64 additions and 9 deletions.
1 change: 1 addition & 0 deletions dahuffman/codecs/json-compact.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"type":"dahuffman code table","version":1,"code_table":[["-",5,0],["5",5,1],["{",7,8],["y",7,9],[".",6,5],["t",5,3],["9",5,4],["a",5,5],["D",7,24],["I",8,50],["z",9,102],["x",9,103],["d",6,13],["B",7,28],["E",7,29],["p",7,30],["J",11,496],["%",12,994],["'",14,3980],["Z",14,3981],[">",13,1991],["Y",10,249],["T",9,125],["O",9,126],["M",9,127],["2",5,8],["l",5,9],["3",5,10],["4",5,11],["\"",3,3],[":",6,32],["h",7,66],["q",10,536],["G",11,1074],["K",11,1075],["/",9,269],["P",9,270],["L",9,271],["e",5,17],["n",5,18],["v",8,152],["_",8,153],["H",9,308],["k",9,309],["\\",8,155],["s",6,39],[" ",5,20],["C",7,84],["b",8,170],["S",8,171],["]",8,172],["[",8,173],["A",7,87],["0",4,11],[",",4,12],["u",6,52],["~",10,848],["X",11,1698],["*",13,6796],["=",14,13594],["+",16,54380],[";",16,54381],["&",16,54382],["?",19,435065],["#",18,217533],["<",17,108767],["V",12,3399],[")",10,850],["(",10,851],["f",8,213],["m",7,107],["1",5,27],["r",6,56],["g",8,228],["w",8,229],["N",8,230],["j",10,924],["Q",12,3700],["@",12,3701],["W",11,1851],["R",10,926],["U",10,927],["i",6,58],["8",6,59],["o",6,60],["7",6,61],["6",6,62],["F",8,252],["}",8,253],["c",7,127]],"eof_code":[19,435064],"metadata":{"frequencies":{"{":7773,"\"":167778,"m":12999,"e":40821,"t":34222,"a":34406,":":19344,"v":5582,"i":29239,"w":6928,"d":17319,"k":3013,"u":25503,"6":30930,"-":31180,"n":41836,"x":2274,",":101928,"D":8359,"o":29553,"g":6808,"r":27010,"p":9126,"h":10043,"c":15557," ":46824,"S":6054,"s":23004,"B":8840,"y":7924,"Z":84,"C":11851,"b":5917,"f":6500,"Y":1210,"l":37539,"(":1582,")":1581,"R":1864,"0":99430,"G":642,"A":12529,"1":52881,"3":38723,"7":30603,"5":32443,"4":39551,"z":2211,"T":2361,"8":29389,"2":36755,"F":7721,"J":545,"U":1887,"9":34397,"O":2411,"E":9084,"I":4262,"L":2738,"M":2483,"[":6124,"N":7237,"_":5684,"}":7761,"]":6104,"j":1773,".":16877,"P":2724,"K":710,"W":936,"H":2761,"V":403,"/":2641,"q":1246,"?":3,"~":1517,"\\":5915,";":24,"X":762,"Q":419,"'":60,"*":180,"@":441,"&":27,"%":265,"<":18,">":146,"+":20,"=":92,"#":9}},"concat":"str_join"}
1 change: 1 addition & 0 deletions dahuffman/codecs/json.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"type":"dahuffman code table","version":1,"code_table":[[" ",2,0],["4",5,8],["p",7,36],["J",11,592],["'",14,4744],[";",15,9490],["&",15,9491],[">",13,2373],["\u2019",16,18992],["<",16,18993],["\u00b3",20,303905],["\u00b5",20,303906],["\u2018",20,303907],["\u201c",20,303908],["\u201d",20,303909],["?",19,151955],["#",17,37989],["+",16,18995],["\u2013",14,4749],["*",13,2375],["Y",10,297],["T",9,149],["O",9,150],["M",9,151],["h",7,38],["q",10,312],["G",11,626],["K",11,627],["P",9,157],["\\",8,79],["n",5,10],["e",5,11],[":",6,24],["H",9,200],["L",9,201],["v",8,101],["C",7,51],["s",6,26],["/",9,216],["~",10,434],["(",10,435],["_",8,109],["]",8,110],["[",8,111],["0",4,7],[",",4,8],["S",8,144],["k",9,290],[")",10,582],["X",11,1166],["V",12,2334],["=",14,9340],["Z",14,9341],["%",13,4671],["A",7,73],["u",6,37],["1",5,19],["b",8,160],["f",8,161],["m",7,81],["r",6,41],["8",6,42],["w",8,172],["g",8,173],["N",8,174],["j",10,700],["Q",12,2804],["@",12,2805],["W",11,1403],["R",10,702],["U",10,703],["7",6,44],["6",6,45],["i",6,46],["-",6,47],["o",6,48],["5",6,49],["F",8,200],["y",8,201],["c",7,101],["}",8,204],["{",8,205],[".",7,103],["9",6,52],["D",8,212],["B",8,213],["I",9,428],["z",10,858],["x",10,859],["E",8,215],["a",6,54],["2",6,55],["t",6,56],["d",7,114],["\n",7,115],["l",6,58],["3",6,59],["\"",4,15]],"eof_code":[20,303904],"metadata":{"frequencies":{"{":7221,"\n":16612," ":274544,"\"":139628,"m":12060,"e":37559,"t":31650,"a":31095,":":18551,"v":4984,"i":26909,"w":6193,"d":15858,"k":2694,"u":22373,"6":26443,"-":26960,"n":37383,"x":2007,",":83974,"D":7319,"o":27011,"g":6197,"r":24175,"p":8411,"h":8986,"c":14405,"S":5342,"s":20773,"B":7595,"y":7193,"Z":83,"C":10275,"b":5411,"f":5884,"Y":1049,"l":32957,"(":1348,")":1348,"R":1633,"0":80916,"G":590,"A":10839,"1":45087,"3":33133,"7":26092,"5":27695,"4":34520,"z":1919,"T":2119,"8":25079,"2":31617,"F":6684,"J":459,"U":1711,"9":29803,"O":2178,"E":7787,"I":3761,"L":2424,"M":2202,"[":5294,"N":6502,"_":5261,"}":7207,"]":5271,"j":1582,".":14620,"P":2345,"K":656,"W":843,"H":2404,"V":338,"/":2499,"q":1077,"?":3,"~":1311,"\\":4819,"\u2013":73,";":24,"X":668,"Q":362,"'":50,"*":149,"@":423,"&":27,"%":198,"\u00b5":1,"\u00b3":1,"<":17,">":125,"+":20,"=":74,"#":9,"\u2019":10,"\u2018":1,"\u201c":1,"\u201d":1}},"concat":"str_join"}
1 change: 1 addition & 0 deletions dahuffman/codecs/shakespeare-lower.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"type":"dahuffman code table","version":1,"code_table":[["r",4,0],["h",4,1],["n",4,2],["s",4,3],["b",6,16],["\u2019",8,68],["!",9,138],["z",11,556],["2",14,4456],["5",15,8914],["&",16,17830],["|",17,35662],["*",17,35663],["\u201d",14,4458],[")",14,4459],["(",14,4460],["8",15,8922],["6",15,8923],["\"",15,8924],["0",15,8925],["\u201c",14,4463],[":",10,279],["'",8,70],[";",8,71],["u",5,9],["i",4,5],["\n",5,12],["g",6,26],["k",7,54],["v",7,55],["a",4,7],["d",5,16],[".",6,34],["f",6,35],["o",4,9],["t",4,10],["l",5,22],["c",6,46],[",",6,47],[" ",3,6],["w",6,56],["y",6,57],["_",10,928],["j",10,929],["?",9,465],["x",10,932],["\u2014",12,3732],["\u2018",14,14932],["\u00e8",17,119464],["\u00e6",17,119465],["7",16,59733],["3",15,29867],["1",14,14934],["\u00e9",17,119480],["/",19,477924],["\u00e0",19,477925],["\u0153",21,1911704],["\u00ee",22,3823410],["#",23,7646823],["\t",22,3823412],["$",22,3823413],["%",23,7646828],["@",23,7646829],["\\",23,7646830],["`",23,7646831],["}",22,3823416],["\u00e2",22,3823417],["\u00ea",21,1911709],["\u00e7",20,955855],["9",16,59741],["4",15,29871],["]",11,1867],["-",10,934],["[",11,1870],["q",11,1871],["p",7,117],["m",6,59],["e",4,15]],"eof_code":[23,7646822],"metadata":{"frequencies":{"\n":138037,"p":61600,"r":252082,"o":332873,"j":4910,"e":481144,"c":92002,"t":354271," ":823018,"g":72877,"u":137495,"n":260496,"b":64105,"\u2019":14526,"s":266719,"h":255777,"m":117542,"l":180842,"w":96316,"k":37816,"f":86188,"i":269305,"a":309773,",":92277,"y":99531,"d":158820,"v":40214,".":83846,"-":6324,":":4523,"*":38,"(":294,"2":247,"0":173,"1":390,"7":94,")":293,"9":107,"4":218,"[":3333,"#":1,"]":3324,"3":188,"8":151,"x":5330,";":17964,"z":1840,"\u2018":361,"?":11061,"q":3953,"5":122,"6":158,"!":8591,"\u00e6":43,"&":62,"\u2014":1412,"\u201c":353,"\u201d":284,"_":4651,"\"":170,"'":17806,"|":32,"\u0153":2,"\u00e0":13,"\u00e9":45,"\u00e8":38,"\u00e2":2,"\u00e7":9,"\u00ee":1,"\u00ea":5,"`":1,"\t":2,"}":2,"\\":1,"/":12,"%":1,"@":1,"$":2}},"concat":"str_join"}
1 change: 1 addition & 0 deletions dahuffman/codecs/shakespeare-raw.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"type":"dahuffman code table","version":1,"code_table":[[" ",2,0],["a",4,4],["d",5,10],["c",6,22],["E",7,46],["v",7,47],["o",4,6],["f",6,28],["T",7,58],["C",8,118],["x",10,476],["(",14,7632],["8",15,15266],["6",15,15267],["\"",15,15268],["%",22,1954432],["@",22,1954433],["\\",22,1954434],["`",22,1954435],["}",21,977218],["\u00e2",21,977219],["\u00e7",19,244305],["\u00c9",20,488612],["\u00ea",20,488613],["/",19,244307],["\u00e6",17,61077],["7",16,30539],["\u201c",14,7635],["\u2014",12,1909],["q",11,955],["G",9,239],["l",5,15],["t",4,8],["w",6,36],[".",6,37],["\n",5,19],["P",9,320],["?",9,321],["L",8,161],["A",7,81],["y",6,41],[",",6,42],["F",9,344],["K",10,690],["j",11,1382],["V",11,1383],["R",8,173],["b",7,87],["p",7,88],["N",8,178],["-",10,716],["]",11,1434],["[",11,1435],["U",9,359],["m",6,45],["i",5,23],["e",4,12],["r",5,26],["I",7,108],["D",9,436],["B",9,437],["O",8,219],["\u2019",9,440],["M",9,441],["S",8,221],["g",7,111],["n",5,28],["s",5,29],["h",5,30],["u",6,62],["k",8,252],["Y",10,1012],["0",15,32416],["3",15,32417],["\u2018",14,16209],["X",14,16210],["1",14,16211],["J",12,4053],[":",11,2027],["W",9,507],["H",9,508],["'",9,509],["!",10,1020],["\u00e9",17,130688],["\u0153",21,2091024],["\u00c6",21,2091025],["\u00ee",22,4182052],["#",23,8364107],["\t",22,4182054],["$",22,4182055],["\u00e0",19,522757],["|",18,261379],["9",16,65345],["4",15,32673],["2",15,32674],["5",16,65350],["&",17,130702],["*",18,261406],["\u00e8",18,261407],["\u201d",15,32676],[")",15,32677],["Z",14,16339],["Q",13,8170],["z",13,8171],["_",11,2043],[";",9,511]],"eof_code":[23,8364106],"metadata":{"frequencies":{"\n":164202,"P":10725,"r":227404,"o":304523,"j":3050,"e":444276,"c":73035,"t":315370," ":1104779,"G":10529,"u":124306,"n":235066,"b":50694,"g":62348,"\u2019":14526,"s":235725,"T":38901,"h":238755,"C":18967,"m":102883,"p":50875,"l":158708,"W":16698,"k":31929,"f":74755,"i":217354,"a":264813,"S":30994,",":92277,"y":92141,"B":13411,"w":79618,"U":13189,"d":145498,"v":37036,".":83846,"Y":7390,"-":6324,"L":22134,"I":51951,":":4523,"*":38,"O":28350,"N":25430,"E":36868,"(":294,"2":247,"0":173,"1":390,"7":94,")":293,"A":44960,"R":24678,"D":13322,"J":1860,"9":107,"4":218,"[":3333,"#":1,"]":3324,"M":14659,"3":188,"F":11433,"8":151,"H":17022,"K":5887,"X":382,"V":3178,";":17964,"z":1240,"\u2018":361,"x":4948,"?":11061,"q":2725,"5":122,"6":158,"!":8591,"\u00e6":40,"&":62,"\u2014":1412,"\u201c":353,"\u201d":284,"_":4651,"Q":1228,"\"":170,"'":17806,"|":32,"Z":600,"\u0153":2,"\u00c6":3,"\u00e0":13,"\u00c9":5,"\u00e9":40,"\u00e8":38,"\u00e2":2,"\u00e7":9,"\u00ee":1,"\u00ea":5,"`":1,"\t":2,"}":2,"\\":1,"/":12,"%":1,"@":1,"$":2}},"concat":"str_join"}
1 change: 1 addition & 0 deletions dahuffman/codecs/shakespeare.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"type":"dahuffman code table","version":1,"code_table":[["n",4,0],["s",4,1],["h",4,2],["u",5,6],["k",7,28],["Y",9,116],["0",14,3744],["3",14,3745],["\u2018",13,1873],["X",13,1874],["1",13,1875],["J",11,469],[":",10,235],["W",8,59],["H",8,60],["'",8,61],["!",9,124],["\u00e9",16,16000],["\u0153",20,256016],["\u00c6",20,256017],["\u00ee",21,512036],["#",22,1024075],["\t",21,512038],["$",21,512039],["\u00e0",18,64005],["|",17,32003],["9",15,8001],["4",14,4001],["2",14,4002],["5",15,8006],["&",16,16014],["*",17,32030],["\u00e8",17,32031],["\u201d",14,4004],[")",14,4005],["Z",13,2003],["Q",12,1002],["z",12,1003],["_",10,251],[";",8,63],["a",4,4],["\n",5,10],["d",5,11],["c",6,24],["E",7,50],["v",7,51],["f",6,26],["T",7,54],["C",8,110],["x",10,444],["(",14,7120],["8",15,14242],["6",15,14243],["\"",15,14244],["%",22,1823360],["@",22,1823361],["\\",22,1823362],["`",22,1823363],["}",21,911682],["\u00e2",21,911683],["\u00e7",19,227921],["\u00c9",20,455844],["\u00ea",20,455845],["/",19,227923],["\u00e6",17,56981],["7",16,28491],["\u201c",14,7123],["\u2014",12,1781],["q",11,891],["G",9,223],["o",4,7],["t",4,8],["l",5,18],["w",6,38],[".",6,39],["P",9,320],["?",9,321],["L",8,161],["A",7,81],["y",6,41],[",",6,42],["F",9,344],["K",10,690],["j",11,1382],["V",11,1383],["R",8,173],["b",7,87],["p",7,88],["N",8,178],["-",10,716],["]",11,1434],["[",11,1435],["U",9,359],["m",6,45],["i",5,23],[" ",3,6],["e",4,14],["r",5,30],["I",7,124],["D",9,500],["B",9,501],["O",8,251],["\u2019",9,504],["M",9,505],["S",8,253],["g",7,127]],"eof_code":[22,1024074],"metadata":{"frequencies":{"\n":138037,"P":10725,"r":227404,"o":304523,"j":3050,"e":444276,"c":73035,"t":315370," ":823018,"G":10529,"u":124306,"n":235066,"b":50694,"g":62348,"\u2019":14526,"s":235725,"T":38901,"h":238755,"C":18967,"m":102883,"p":50875,"l":158708,"W":16698,"k":31929,"f":74755,"i":217354,"a":264813,"S":30994,",":92277,"y":92141,"B":13411,"w":79618,"U":13189,"d":145498,"v":37036,".":83846,"Y":7390,"-":6324,"L":22134,"I":51951,":":4523,"*":38,"O":28350,"N":25430,"E":36868,"(":294,"2":247,"0":173,"1":390,"7":94,")":293,"A":44960,"R":24678,"D":13322,"J":1860,"9":107,"4":218,"[":3333,"#":1,"]":3324,"M":14659,"3":188,"F":11433,"8":151,"H":17022,"K":5887,"X":382,"V":3178,";":17964,"z":1240,"\u2018":361,"x":4948,"?":11061,"q":2725,"5":122,"6":158,"!":8591,"\u00e6":40,"&":62,"\u2014":1412,"\u201c":353,"\u201d":284,"_":4651,"Q":1228,"\"":170,"'":17806,"|":32,"Z":600,"\u0153":2,"\u00c6":3,"\u00e0":13,"\u00c9":5,"\u00e9":40,"\u00e8":38,"\u00e2":2,"\u00e7":9,"\u00ee":1,"\u00ea":5,"`":1,"\t":2,"}":2,"\\":1,"/":12,"%":1,"@":1,"$":2}},"concat":"str_join"}
1 change: 1 addition & 0 deletions dahuffman/codecs/xml.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"type":"dahuffman code table","version":1,"code_table":[["6",6,0],["T",10,16],["U",10,17],["~",9,9],["q",8,5],["N",8,6],["Q",12,112],["\\",12,113],["K",11,57],[")",10,29],["\n",9,15],["9",6,2],["w",6,3],["y",6,4],["z",8,20],[",",8,21],["(",10,88],["%",13,712],["Z",15,2852],["[",15,2853],["\u2013",14,1427],["{",13,714],["}",13,715],["X",11,179],["R",10,90],["Y",11,182],["J",12,366],["*",13,734],["]",15,2940],["\t",15,2941],["|",14,1471],["x",8,23],["g",6,6],[".",6,7],["a",4,2],["u",5,6],["F",8,56],[":",8,57],["v",7,29],["3",6,15],["b",7,32],["B",8,66],["D",8,67],["l",6,17],["/",5,9],["t",4,5],["j",8,96],["E",8,97],["f",7,49],["2",6,25],["m",6,26],["p",6,27],["d",5,14],["c",5,15],["e",4,8],["h",6,36],["I",9,296],["H",10,594],["O",10,595],["P",10,596],["W",12,2388],["V",12,2389],["#",13,4780],["'",14,9562],["?",16,38252],["+",17,76506],["@",18,153014],["\u200c",19,306030],["\ufeff",21,1224125],["\u00f1",20,612063],["!",15,19127],["G",12,2391],["&",10,598],["M",10,599],["=",7,75],["<",5,19],[">",5,20],["1",6,42],["-",6,43],["s",5,22],["\"",6,46],["5",7,94],["L",10,760],[";",10,761],["S",9,381],["C",8,191],["0",5,24],["n",5,25],["_",5,26],["o",5,27],["i",5,28],["8",7,116],["7",7,117],["4",7,118],["A",8,238],["k",8,239],[" ",5,30],["r",5,31]],"eof_code":[21,1224124],"metadata":{"frequencies":{"<":46938,"r":59681,"e":90285,"s":50883,"p":20779,"o":55844,"n":55269,">":46983,"w":14986," ":57483,"_":55491,"i":57191,"d":41552,"=":12621,"\"":25423,"-":25348,"a":71880,"b":9330,"~":1905,"8":14107,"5":13388,"f":10425,"z":3910,"k":7305,"c":42556,"u":35712,"0":53392,"4":14407,"1":24184,"D":4802,"A":7226,"9":14922,"6":14904,"F":4441,"B":4775,"t":80992,"h":21868,":":4540,"/":38985,".":17228,"y":15221,"x":4391,"j":4977,"m":20687,"l":19812,"2":20584,"g":16871,"3":18132,"7":14377,"v":9120,"q":3682,"E":5051,"C":7077,"N":3713,",":4127,"(":993,")":992,"K":484,"V":320,"I":2495,"*":154,"U":915,"X":530,"Y":541,"S":3627,"W":315,"M":1496,"J":282,"R":1112,"O":1304,"'":86,"G":403,"H":1268,"Q":212,"L":1532,"P":1389,"%":124,"T":827,"Z":30,"\u2013":68,"#":163,"&":1393,";":1584,"!":53,"+":13,"\n":1947,"[":30,"]":30,"?":21,"|":78,"\\":231,"{":137,"}":137,"\t":48,"\u00f1":3,"\u200c":3,"\ufeff":2,"@":7}},"concat":"str_join"}
2 changes: 1 addition & 1 deletion dahuffman/codetableio.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ def json_save(
path = Path(path)
ensure_dir(path.parent)
with path.open("w", encoding="utf8") as f:
json.dump(obj=data, fp=f)
json.dump(obj=data, fp=f, indent=None, separators=(",", ":"))
_log.info(
f"Saved {type(codec).__name__} code table ({len(code_table)} items) to {str(path)!r}"
)
Expand Down
23 changes: 20 additions & 3 deletions train/json-data.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from collections import Counter

from dahuffman import HuffmanCodec
from dahuffman.codetableio import json_save, pickle_save
from train.train_utils import CODECS, download

_log = logging.getLogger()
Expand Down Expand Up @@ -54,12 +55,28 @@ def main():
# TODO add more metadata
_log.info(f"Frequencies raw {len(frequencies_raw)}: {frequencies_raw}")
codec = HuffmanCodec.from_frequencies(frequencies_raw)
codec.save(CODECS / "json.pickle", metadata={"frequencies": frequencies_raw})
pickle_save(
codec=codec,
path=CODECS / "json.pickle",
metadata={"frequencies": frequencies_raw},
)
json_save(
codec=codec,
path=CODECS / "json.json",
metadata={"frequencies": frequencies_raw},
)

_log.info(f"Frequencies compact {len(frequencies_compact)}: {frequencies_compact}")
codec = HuffmanCodec.from_frequencies(frequencies_compact)
codec.save(
CODECS / "json-compact.pickle", metadata={"frequencies": frequencies_compact}
pickle_save(
codec=codec,
path=CODECS / "json-compact.pickle",
metadata={"frequencies": frequencies_compact},
)
json_save(
codec=codec,
path=CODECS / "json-compact.json",
metadata={"frequencies": frequencies_compact},
)


Expand Down
34 changes: 30 additions & 4 deletions train/shakespeare.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from collections import Counter

from dahuffman import HuffmanCodec
from dahuffman.codetableio import json_save, pickle_save
from train.train_utils import CODECS, download

_log = logging.getLogger()
Expand All @@ -27,7 +28,16 @@ def main():
frequencies = Counter(raw)
_log.info(f"Frequencies {len(frequencies)}: {frequencies}")
codec = HuffmanCodec.from_frequencies(frequencies)
codec.save(CODECS / "shakespeare-raw.pickle", metadata={"frequencies": frequencies})
pickle_save(
codec=codec,
path=CODECS / "shakespeare-raw.pickle",
metadata={"frequencies": frequencies},
)
json_save(
codec=codec,
path=CODECS / "shakespeare-raw.json",
metadata={"frequencies": frequencies},
)

_log.info("Doing white space clean up")
clean = raw
Expand All @@ -36,15 +46,31 @@ def main():
frequencies = Counter(clean)
_log.info(f"Frequencies {len(frequencies)}: {frequencies}")
codec = HuffmanCodec.from_frequencies(frequencies)
codec.save(CODECS / "shakespeare.pickle", metadata={"frequencies": frequencies})
pickle_save(
codec=codec,
path=CODECS / "shakespeare.pickle",
metadata={"frequencies": frequencies},
)
json_save(
codec=codec,
path=CODECS / "shakespeare.json",
metadata={"frequencies": frequencies},
)

_log.info("Only handling lower case")
lower = clean.lower()
frequencies = Counter(lower)
_log.info(f"Frequencies {len(frequencies)}: {frequencies}")
codec = HuffmanCodec.from_frequencies(frequencies)
codec.save(
CODECS / "shakespeare-lower.pickle", metadata={"frequencies": frequencies}
pickle_save(
codec=codec,
path=CODECS / "shakespeare-lower.pickle",
metadata={"frequencies": frequencies},
)
json_save(
codec=codec,
path=CODECS / "shakespeare-lower.json",
metadata={"frequencies": frequencies},
)


Expand Down
8 changes: 7 additions & 1 deletion train/xml-data.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from collections import Counter

from dahuffman import HuffmanCodec
from dahuffman.codetableio import json_save, pickle_save
from train.train_utils import CODECS, download

_log = logging.getLogger()
Expand Down Expand Up @@ -49,7 +50,12 @@ def main():
# TODO add more metadata
_log.info(f"Frequencies raw {len(frequencies)}: {frequencies}")
codec = HuffmanCodec.from_frequencies(frequencies)
codec.save(CODECS / "xml.pickle", metadata={"frequencies": frequencies})
pickle_save(
codec=codec, path=CODECS / "xml.pickle", metadata={"frequencies": frequencies}
)
json_save(
codec=codec, path=CODECS / "xml.json", metadata={"frequencies": frequencies}
)


if __name__ == "__main__":
Expand Down

0 comments on commit ebbf191

Please sign in to comment.