-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathTokenizer.py
55 lines (47 loc) · 14.8 KB
/
Tokenizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
class Tokenizer:
    """Greedy longest-match tokenizer over a fixed, hard-coded vocabulary.

    The vocabulary mixes single characters, short punctuation sequences,
    board squares ('a1'..'h8'), field names such as 'FEN' and 'Moves',
    theme names, and words taken from opening names.  A token's integer ID
    is its position in ``token_list``.

    Attributes:
        token_list: vocabulary in ID order (ID == list index).
        token_dict: token string -> ID.  ``token_list`` contains repeated
            entries (e.g. 'Sodium', 'Foltys'); for those the dict keeps the
            index of the LAST occurrence, so ``encode`` emits that ID while
            ``decode`` accepts any of the indices.
        lengths: distinct token lengths, longest first; drives the greedy
            matching in ``encode``.

    NOTE(review): entries such as 'ns-Wiedenhagen', 'Stoe-Shumov' and
    'StoBeginners', together with the repeated runs around them, look like
    accidental paste artifacts.  They are kept verbatim because removing or
    reordering entries would shift every later ID - confirm against any
    data already encoded with this vocabulary before cleaning them up.
    """

    def __init__(self):
        """Build the vocabulary list, the reverse lookup and the length table."""
        # Order matters: the index of each entry is its token ID.
        self.token_list = ['|<eot_token>|', 'K', 'k', 'Q', 'q', 'R', 'r', 'B', 'b', 'N', 'n', 'P', 'p', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '/', '{', '}', ',', ':', '"', 'w', '-', ' ', '{"', '"}', '":"', '","', 'a1', 'a2', 'a3', 'a4', 'a5', 'a6', 'a7', 'a8', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7', 'b8', 'c1', 'c2', 'c3', 'c4', 'c5', 'c6', 'c7', 'c8', 'd1', 'd2', 'd3', 'd4', 'd5', 'd6', 'd7', 'd8', 'e1', 'e2', 'e3', 'e4', 'e5', 'e6', 'e7', 'e8', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'g1', 'g2', 'g3', 'g4', 'g5', 'g6', 'g7', 'g8', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'h7', 'h8', 'FEN', 'Phase', 'OpeningTags', 'Goal', 'Motif', 'Length', 'Mate', 'Moves', 'Rating', 'opening', 'middlegame', 'endgame', 'rookEndgame', 'bishopEndgame', 'pawnEndgame', 'knightEndgame', 'queenEndgame', 'queenRookEndgame', 'advancedPawn', 'attackingF2F7', 'capturingDefender', 'discoveredAttack', 'doubleCheck', 'exposedKing', 'fork', 'hangingPiece', 'kingsideAttack', 'pin', 'queensideAttack', 'sacrifice', 'skewer', 'trappedPiece', 'attraction', 'clearance', 'defensiveMove', 'deflection', 'interference', 'intermezzo', 'quietMove', 'xRayAttack', 'zugzwang', 'mate', 'mateIn1', 'mateIn2', 'mateIn3', 'mateIn4', 'mateIn5', 'anastasiaMate', 'arabianMate', 'backRankMate', 'bodenMate', 'doubleBishopMate', 'dovetailMate', 'hookMate', 'smotheredMate', 'equality', 'advantage', 'crushing', 'oneMove', 'short', 'long', 'veryLong', 'Pseudo', 'Queens', 'Indian', 'Defense', 'Other', 'variations', 'Grob', 'Opening', 'Gambit', 'Bird', 'Dutch', 'Variation', 'Declined', 'Ragozin', 'Danish', 'Modern', 'Albin', 'Countergambit', 'Caro-Kann', 'Panov', 'Attack', 'Three', 'Knights', 'Zukertort', 'English', 'Agincourt', 'Scandinavian', 'Icelandic-Palme', 'French', 'Steinitz', 'Sicilian', 'Lasker-Pelikan', 'Horwitz', 'Pirc', 'Kholmov', 'System', 'Vant', 'Kruijs', 'Scotch', 'Game', 'Pawn', 'Alekhine', 'Four', 'Halloween', 'Ruy', 'Lopez', 'Morphy', 'Italian', 'Bishops', 'Boden-Kieseritzky', 'Exchange',
            'Kings', 'Accepted', 'MacLeod', 'Philidor', 'Anti-Fried', 'Liver', 'Old', 'Giuoco', 'Pianissimo', 'Russian', 'Salvio', 'Grand', 'Prix', 'Slav', 'Hyperaccelerated', 'Fianchetto', 'Vienna', 'Stanley', 'Wayward', 'Queen', 'Two', 'Knight', 'Tarrasch', 'Nyezhmetdinov-Rossolimo', 'Makogonov', 'Moscow', 'Max', 'Lange', 'Spanish', 'Najdorf', 'McConnell', 'Smith-Morra', 'Rousseau', 'Ponziani', 'Jaenisch', 'Counterattack', 'Czech', 'Baltic', 'Mieses-Kotroc', 'Advance', 'Austrian', 'Weber', 'Kan', 'Nimzowitsch', 'Kalashnikov', 'Wing', 'Englund', 'Complex', 'Budapest', 'The', 'Whale', 'Forgacs', 'Invitation', 'Nimzo-Larsen', 'Snyder', 'Benoni', 'Classical', 'Kennedy', 'Anglo-Indian', 'Polish', 'Stafford', 'Anti-Grunfeld', 'Katalimov', 'Symmetrical', 'Hartlaub-Charlick', 'Hungarian', 'Normal', 'Alapin', 'Evans', 'Semi-Slav', 'Semi-Meran', 'Latvian', 'Fraser', 'Marshall', 'Mediterranean', 'Main', 'Line', 'Closed', 'Hopton', 'Variations', 'Delayed', 'Cozio', 'Harrwitz', 'OKelly', 'Quiet', 'Accelerated', 'London', 'Colle', 'Elephant', 'Chigorin', 'Yugoslav', 'Central', 'Burn', 'Van', 'Geet', 'Soller', 'Pin', 'Pseudo-Tarrasch', 'Haxo', 'Reti', 'Carr', 'Dragon', 'Piano', 'Nimzo-Indian', 'Lowenthal', 'Owen', 'Barnes', 'Walkerling', 'Williams', 'Becker', 'Goring', 'Open', 'Boi', 'Blackburne-Kostic', 'Anti-Nimzo-Indian', 'Bowdler', 'Froms', 'Yusupov-Rubinstein', 'Franco-Nimzowitsch', 'Tennison', 'Center', 'Berger', 'Berlin', 'Showalter', 'Eisenberg', 'Pawns', 'Grunfeld', 'Blackmar-Diemer', 'Euwe', 'Rapport-Jobava', 'McDonnell', 'Mieses', 'Semi-Tarrasch', 'Maroczy', 'Paris', 'Mexican', 'Rubinstein', 'Barmen', 'Rat', 'Small', 'Standard', 'Richter-Rauzer', 'Hybrid', 'Tubingen', 'Schmidt', 'Spielmann-Indian', 'Hanham', 'Krause', 'Neo-Grunfeld', 'Staunton-Cochrane', 'Mason', 'Goldman', 'Taimanov', 'Formation', 'Black', 'Mustang', 'Scheveningen', 'Hillbilly', 'Ware', 'Paulsen', 'Mengarini', 'Valencian', 'Franco-Sicilian', 'Leonardis', 'Fischer', 'Spassky', 'Defensive', 'Prins', 'Lewis',
            'Stonewall', 'Karpov', 'Paulsen-Basman', 'Levitsky', 'Tartakower', 'Saduleto', 'Harmonist', 'Hubsch', 'Blackmar', 'Orthoschnapp', 'Larsen', 'Borg', 'Lion', 'Spike', 'Czech-Indian', 'Felbecker', 'Blachly', 'Busch-Gass', 'Drazic', 'Veresov', 'Trompowsky', 'La', 'Bourdonnais', 'Mikenas', 'Winawer', 'von', 'der', 'Lasa', 'Wade-Tartakower', 'Sozin', 'Keres', 'Neo-Catalan', 'Canal', 'Breyer', 'Torre', 'Anderssen', 'Catalan', 'Anglo-Scandinavian', 'Anglo-Dutch', 'Falkbeer', 'Anti-Torre', 'Romanishin', 'Mlotkowski', 'Potter', 'Benko', 'Gedults', 'Great', 'Snake', 'Lolli', 'Gubinsky-Melts', 'Rasa-Studier', 'Morris', 'Saragossa', 'Campomanes', 'Gunderam', 'Leningrad', 'Reversed', 'Kramnik', 'Kieseritzky', 'Ryder', 'Blackburne-Kloosterboer', 'Head', 'Ross', 'Anderssens', 'Move', 'Order', 'Benoni-Indian', 'Samisch', 'Traditional', 'Carls-Bremen', 'Rosenthal', 'Orthodox', 'Anglo-Slav', 'Damiano', 'with', 'Ghulam-Kassim', 'Kadas', 'Hennig', 'Cochrane', 'Lemberger', 'Cunningham', 'Byrne', 'Kveinis', 'Meran', 'Khan', 'Fritz', 'Averbakh', 'Kangaroo', 'Stein', 'Balogh', 'Schliemann', 'Kingside', 'Chekhover', 'Halasz-McDonnell', 'Bronstein', 'Smith', 'Mayet', 'Cambridge', 'Springs', 'Steiner', 'Duras', 'Urusov', 'Canard', 'Ukrainian', 'Mongredien', 'Ranken', 'Reshevsky', 'Macleod', 'Orangutan', 'Kotov', 'Schilling-Kostic', 'Sorensen', 'Copenhagen', 'Clam', 'Boehnke', 'Norwegian', 'Pseudo-Austrian', 'Anti-Philidor', 'Neo-Orthodox', 'Transfer', 'Zinnowitz', 'Smyslov', 'Anglo-Lithuanian', 'Nimzo-American', 'Nf3', 'Schara', 'Allgaier', 'Lasker', 'Antal', 'Kloosterboer', 'Staunton', 'Goldsmith', 'Hobbs', 'Mengarinis', 'Schallopp', 'Benima', 'Zeller', 'Muzio', 'Deutz', 'Dunst-Perrenet', 'Seirawan', 'St', 'George', 'Ziegler', 'Wagner', 'Bogo-Indian', 'Tayler', 'Gunsberg', 'Gellers', 'Carlson', 'Noteboom', 'Kasparov-Petrosian', 'Bishop', 'Calabrese', 'Wade', 'Mikenas-Carls', 'Weinsbach', 'Declination', 'Godiva', 'Schlechter', 'Kluever', 'Raphael', 'Krebs', 'Dresden', 'Bonet', 'Nurnberg',
            'Weenink', 'Geller', 'Lasker-Dunne', 'Kasparov', 'Ultra-Delayed', 'Richter-Veresov', 'East', 'Chebanenko', 'Mannheim', 'Abbazia', 'Portuguese', 'Double', 'Rosentreter', 'Konstantinopolsky', 'El', 'Columpio', 'Portsmouth', 'Bulgarian', 'Amar', 'Gurgenidze', 'Colorado', 'Charousek', 'Chinese', 'Frankenstein-Dracula', 'Omega', 'Deferred', 'Pterodactyl', 'Mafia', 'Blackmars', 'Second', 'Blackburne', 'Outflank', 'Apocalypse', 'Lisitsyn', 'Omaha', 'Millennium', 'Devin', 'Stoltz', 'Clemenz', 'Paleface', 'Wagner-Zwitersch', 'Flohr', 'Lions', 'Jaw', 'Petersburg', 'Brinckmann', 'Warsaw', 'Bronstein-Larsen', 'Netherlands', 'Queenside', 'Neumann', 'Caro', 'Bonsch-Osmolovsky', 'Battambang', 'Basman', 'Malaniuk', 'Mokele', 'Mbembe', 'Panteldakis', 'Alessi', 'Return', 'Keene', 'Dougherty', 'Napoleon', 'Keenes', 'Ulysses', 'Naselwaus', 'Romih', 'Gedult', 'Jerome', 'Dubov', 'Myers', 'Petrovs', 'Maddigan', 'Krejcik', 'Anti-Moscow', 'Teichmann', 'Vukovic', 'Richter', 'Lamb', 'Greco', 'Capablanca', 'Zilbermints', 'Semi-Classical', 'MacCutcheon', 'Blumenfeld', 'Dzindzi-Indian', 'Stockholm', 'Western', 'Przepiorka', 'Anti-Tartakower', 'Anglo-Grunfeld', 'New', 'Pierce', 'Janowski', 'Pachman', 'Pelikan', 'Beyer', 'Omega-Delta', 'Quinteros', 'Hall', 'Vos', 'Polerio', 'Novosibirsk', 'Push', 'Edge', 'Langeheinecke', 'Spielmann', 'Kopec', 'Amazon', 'Siberian', 'Hippopotamus', 'Alekhine-Chatard', 'Zaire', 'OSullivan', 'Schiffler-Sokolsky', 'Bugayev', 'Diemer', 'Santasiere', 'Diemer-Duhm', 'Brussels', 'Popiel', 'Hubner', 'Franco-Hiva', 'Miles', 'Herrstrom', 'Reti-Spielmann', 'Petrosian', 'Hromadka', 'Bilguer', 'Retreat', 'Botvinnik', 'Tortoise', 'Karklins-Martinovsky', 'Bogoljubov', 'Finnish', 'Semi-Benoni', 'Pseudo-Spanish', 'Riumin', 'Schiller-Pytel', 'Been-Koomen', 'Zaitsev', 'Barry', 'Matovinsky', 'Janowski-Larsen', 'Gaw-Paw', 'Connection', 'Semi-Averbakh', 'Birmingham', 'Kupreichik', 'Creepy', 'Crawly', 'Lemming', 'Lanc-Arnold', 'Boleslavsky', 'Westphalian', 'Fully', 'Albin-Blackburne',
            'Bayonet', 'Coles', 'Jalalabad', 'Fried', 'Fox', 'Roscher', 'King', 'Davids', 'Chameleon', 'Quade', 'Kiel', 'Perseus', 'Vinogradov', 'Wahls', 'Lucena', 'Brooklyn', 'Non-', 'or', 'Kmoch', 'Birds', 'Hamppe-Allgaier', 'Dodo', 'Raptor', 'Opocensky', 'Eastern', 'Giraffe', 'Counterthrust', 'West', 'Achilles-Omega', 'Venice', 'Robatsch', 'Hanstein', 'Walk', 'Bucker', 'Wagenbach', 'Soultanbeieff', 'Speers', 'Vitzthum', 'Lithuanian', 'Hunt', 'Kazakh', 'Squirrel', 'Foltys-Leonhardt', 'Labahn', 'Noahs', 'Ark', 'Trap', 'Australian', 'Simul', 'Special', 'Hjrring', 'Prague', 'Crab', 'Wade-Smyslov', 'Hector', 'Masi', 'Hornung', 'Relfsson', 'Irish', '150', 'Grigorian', 'Manhattan', 'Kramer', 'Duck', 'Schulze-Muller', 'Bogoljubow', 'Goglidze', 'Hamppe-Muzio', 'Rotary-Albany', 'Valencia', 'Dus-Khotimirsky', 'Kaufmann', 'Neo-Mongoloid', 'Halasz', 'Wasp', 'Norwalde', 'Pseudo-Benko', 'Walrus', 'Swedish', 'Langeheinicke', 'Santasieres', 'Folly', 'Graz', 'Gianutio', 'Hooydoon', 'Hammerschlag', 'Boden', 'Ultra-delayed', 'Meadow', 'Hay', 'Gibbins-Weidenhagen', 'Petruccioli', 'Cannstatter', 'Bellon', 'Woodchuck', 'Braune', 'Janzen-Korchnoi', 'Troon', 'Mongoose', 'Yates', 'Korchnoi', 'Hobbs-Zilbermints', 'Hawk', 'Brick', 'Diemer-Rosenberg', 'Sturm', 'Pseudo-Samisch', 'Lazard', 'Dory', 'Wind', 'Guatemala', 'De', 'Bruycker', 'Gibbins-Wiedenhagen', 'Sodium', 'Randspringer', 'Kronberger', 'Norfolk', 'Linares', 'Potato', 'Hamppe-Meitner', 'Welling', 'Magnus', 'Lizard', 'Leonhardt', 'Tartakower-Indian', 'Global', 'Cormorant', 'Semi-Leningrad', 'Boyce', 'Vitolins', 'Pietrowsky', 'Zhuravlev', 'Montevideo', 'Poisoned', 'Kuijk', 'Sveshnikov', 'Anti-Queens', 'Mason-Keres', 'Mujannah', 'Brombacher', 'Henneberger', 'Woozle', 'Arctic', 'Simagin', 'Meitner', 'Elbert', 'Dlugy', 'Toikkanen', 'Rooks', 'Swap', 'Cochrane-Shumov', 'Foltys', 'Nescafe', 'Frappe', 'Horsefly', 'Mosquito', 'Pollock', 'Steinitz-Rosenthal', 'Beefeater', 'Storm', 'Soller-Zilbermints', 'Svenonius', 'Corkscrew', 'Sosonko', 'Kingfisher',
            'Snagglepuss', 'Drill', 'German', 'Baeuerle', 'Godes', 'Adorjan', 'Uhlmann', 'Hartlaub', 'Lutikov', 'Pyrenees', 'Speelsmet', 'Beginners', 'Nowokunski', 'Tumbleweed', 'Banzai-Leong', 'Vulture', 'Big', 'Clamp', 'Hort-Antoshin', 'Bongcloud', 'Stamma', 'Chandler', 'Hekili-Loa', 'Silberschmidt', 'Nimzo-Dutch', 'Marienbad', 'Bavarian', 'Rhamphorhynchus', 'Neo-Modern', 'Haiti', 'Kaulich', 'Liedmann', 'Pillsbury', 'Billockus-Johansen', 'Perrin', 'Storming', 'Suchting', 'ns-Wiedenhagen', 'Sodium', 'Randspringer', 'Kronberger', 'Norfolk', 'Linares', 'Potato', 'Hamppe-Meitner', 'Welling', 'Magnus', 'Lizard', 'Leonhardt', 'Tartakower-Indian', 'Global', 'Cormorant', 'Semi-Leningrad', 'Boyce', 'Vitolins', 'Pietrowsky', 'Zhuravlev', 'Montevideo', 'Poisoned', 'Kuijk', 'Sveshnikov', 'Anti-Queens', 'Mason-Keres', 'Mujannah', 'Brombacher', 'Henneberger', 'Woozle', 'Arctic', 'Simagin', 'Meitner', 'Elbert', 'Dlugy', 'Toikkanen', 'Rooks', 'Swap', 'Cochrane-Shumov', 'Foltys', 'Nescafe', 'Frappe', 'Horsefly', 'Mosquito', 'Pollock', 'Steinitz-Rosenthal', 'Beefeater', 'Storm', 'Soller-Zilbermints', 'Svenonius', 'Corkscrew', 'Sosonko', 'Kingfisher', 'Snagglepuss', 'Drill', 'German', 'Baeuerle', 'Godes', 'Adorjan', 'Uhlmann', 'Hartlaub', 'Lutikov', 'Pyrenees', 'Speelsmet', 'Beginners', 'Nowokunski', 'Tumbleweed', 'Banzai-Leong', 'Vulture', 'Big', 'Clamp', 'Hort-Antoshin', 'Bongcloud', 'Stamma', 'Chandler', 'Hekili-Loa', 'Silberschmidt', 'Nimzo-Dutch', 'Marienbad', 'Bavarian', 'Rhamphorhynchus', 'Neo-Modern', 'Haiti', 'Kaulich', 'Liedmann', 'Pillsbury', 'Billockus-Johansen', 'Perrin', 'Stoe-Shumov', 'Foltys', 'Nescafe', 'Frappe', 'Horsefly', 'Mosquito', 'Pollock', 'Steinitz-Rosenthal', 'Beefeater', 'Storm', 'Soller-Zilbermints', 'Svenonius', 'Corkscrew', 'Sosonko', 'Kingfisher', 'Snagglepuss', 'Drill', 'German', 'Baeuerle', 'Godes', 'Adorjan', 'Uhlmann', 'Hartlaub', 'Lutikov', 'Pyrenees', 'Speelsmet', 'Beginners', 'Nowokunski', 'Tumbleweed', 'Banzai-Leong', 'Vulture', 'Big', 'Clamp',
            'Hort-Antoshin', 'Bongcloud', 'Stamma', 'Chandler', 'Hekili-Loa', 'Silberschmidt', 'Nimzo-Dutch', 'Marienbad', 'Bavarian', 'Rhamphorhynchus', 'Neo-Modern', 'Haiti', 'Kaulich', 'Liedmann', 'Pillsbury', 'Billockus-Johansen', 'Perrin', 'StoBeginners', 'Nowokunski', 'Tumbleweed', 'Banzai-Leong', 'Vulture', 'Big', 'Clamp', 'Hort-Antoshin', 'Bongcloud', 'Stamma', 'Chandler', 'Hekili-Loa', 'Silberschmidt', 'Nimzo-Dutch', 'Marienbad', 'Bavarian', 'Rhamphorhynchus', 'Neo-Modern', 'Haiti', 'Kaulich', 'Liedmann', 'Pillsbury', 'Billockus-Johansen', 'Perrin', 'Storming', 'Suchting', 'Quaade', 'Edinburgh', 'Zurich', 'Tour', 'Medusa', 'Pozarek', 'Furman', 'Pomar', 'Rossolimo', 'Carrera', 'Ringelbach', 'Ritter', 'Brentano', 'Panov-Botvinnik', 'Velimirovic', 'Gaga', 'Hergert', 'Fyfe', 'Schuehler', 'Venezolana', 'Schwartz', 'Benoni-Staunton', 'Kitchener', 'Hevendehl', 'lHermet', 'England', 'Eastbourne', 'Picklepuss', 'Kudischewitsch', 'Massachusetts', 'Romford', 'Liebig', 'Westermann', 'Heyde', 'Lundin', 'San', 'Jorge', 'Shy', 'Schneider', 'Scorpion-Horus', 'Snail', 'Fuller', 'Hanneken', 'Amsterdam', 'Thomas', 'Laroche', 'Semmering', 'Keoni-Hiva', 'Gloria', 'Poli', 'Bardeleben', 'Double-Dutch', 'Anti-Noteboom', 'Batavo-Polish']
        # Later duplicates overwrite earlier ones, so a repeated token maps
        # to the index of its last occurrence.
        self.token_dict = {token: i for i, token in enumerate(self.token_list)}
        # Only the set of distinct lengths is needed (longest first) so that
        # encode() can try the longest candidate slice before shorter ones.
        self.lengths = sorted({len(token) for token in self.token_list}, reverse=True)

    def encode_single(self, token):
        """Convert a token to its corresponding integer ID.

        Args:
            token: the exact token string to look up.

        Returns:
            The integer ID, or ``None`` when the token is not in the
            vocabulary (an error line is printed in that case).
        """
        token_id = self.token_dict.get(token)
        # dict.get() reports a miss as None; comparing against -1 would never
        # fire and the error message would be unreachable.
        if token_id is None:
            print(f"Error> no token_id for {token}")
        return token_id

    def decode_single(self, token_id):
        """Convert an integer ID back to its corresponding token.

        Args:
            token_id: index into ``token_list``.

        Returns:
            The token string, or ``""`` when the ID is out of range (an
            error line is printed in that case).
        """
        if 0 <= token_id < len(self.token_list):
            return self.token_list[token_id]
        print(f"Error> token_id out of range: {token_id}")
        return ""

    def encode(self, string):
        """Convert a string into a list of integer IDs.

        Tokens may be consecutive without separators: at each position the
        longest vocabulary entry that matches is consumed.  A character that
        starts no known token is reported with a warning and skipped.

        Args:
            string: text to tokenize.

        Returns:
            List of token IDs in input order.
        """
        token_dict = self.token_dict
        lengths = self.lengths
        end = len(string)
        encoded_tokens = []
        i = 0
        while i < end:
            for length in lengths:
                token = string[i:i + length]
                token_id = token_dict.get(token)
                # Compare with None explicitly: ID 0 ('|<eot_token>|') is a
                # valid token and must not be dropped by a truthiness test.
                if token_id is not None:
                    encoded_tokens.append(token_id)
                    i += len(token)
                    break
            else:
                # No token of any length starts here; skip one character.
                print(f"Warning> No token found for {string[i]}")
                i += 1
        return encoded_tokens

    def decode(self, token_ids):
        """Convert a list of integer IDs to a string.

        Args:
            token_ids: iterable of token IDs.

        Returns:
            The concatenation of the decoded tokens; out-of-range IDs
            contribute an empty string (see ``decode_single``).
        """
        return ''.join(self.decode_single(token_id) for token_id in token_ids)