-
Notifications
You must be signed in to change notification settings - Fork 2
/
script.py
1999 lines (1551 loc) · 102 KB
/
script.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
import os
os.environ["WANDB_MODE"] = "offline"
# os.environ["WANDB_DISABLED"] = "true"
import json
import math
import random
import shutil
import sys
import threading
import time
import traceback
from datetime import datetime
from pathlib import Path
import gradio as gr
import pandas as pd
import torch
import transformers
from functools import partial
from .custom_scheduler import FPSchedulerTrainer, FPNEFtuneTrainer
from .matplotgraph import create_graph
from .train_utils import get_available_loras_local, precise_cut, sliding_block_cut, download_file_from_url
from peft.tuners.lora import QuantLinear
import bitsandbytes as bnb
from datasets import Dataset, load_dataset, DatasetDict
from peft import (
LoraConfig,
get_peft_model,
prepare_model_for_kbit_training,
set_peft_model_state_dict
)
from peft.utils.other import \
TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING as model_to_lora_modules
from transformers.models.auto.modeling_auto import (
MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
)
import warnings
warnings.filterwarnings(action = "ignore", message="torch.utils.checkpoint:")
warnings.filterwarnings(action = "ignore", message="`do_sample` is set to `False`")
from modules import shared, utils
from modules.ui import create_refresh_button
from modules.evaluate import (
calculate_perplexity,
generate_markdown_table,
save_past_evaluations
)
from modules.logging_colors import logger
from modules.models import reload_model, unload_model, load_model
from modules.utils import natural_keys
# Extension metadata: the display name and a flag marking it as a tab.
# NOTE(review): presumably read by the host web UI's extension loader — confirm.
params = {
"display_name": "Training PRO",
"is_tab": True
}
# Runtime state kept separately from the PARAMETERS list below.
# Callbacks defined in ui() write into this dict while the app is running
# ("stop_at_loss", "stop_at_epoch", "save_checkpoint_now") and read the
# "training_loop" flag to decide whether to print a queue message.
non_serialized_params = {
# Passed through to sliding_block_cut / precise_cut when slicing raw text.
"debug_slicer": False,
# Passed to get_available_loras_local when listing LoRAs in the UI.
"Lora_sortedByTime": False,
"stop_at_loss": 0,
"stop_at_epoch": 0,
"save_steps_under_loss": 0.0,
# Set to True by the "Queue Checkpoint Now" button callback.
"save_checkpoint_now": False,
# Truthy while training is considered active (checked by the UI callbacks).
"training_loop": False,
"current_stability": 0,
"save_epochs": 0,
"checkpoint_offset": 0,
"epoch_offset":0,
"safe_serialization": False,
"dump_dataset": False,
"dump_dataset_remove_s": True,
}
# NOTE(review): usage is not visible in this part of the file.
mapped_prompts = 0
# Inverse of MODEL_FOR_CAUSAL_LM_MAPPING_NAMES: every value of that mapping
# becomes a key pointing back at its original key.
MODEL_CLASSES = {v[1]: v[0] for v in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.items()}
# Names of the training parameters. The order matches, one-to-one, the
# `all_params` list of Gradio components assembled in ui().
PARAMETERS = ["lora_name", "always_override", "save_steps", "micro_batch_size", "batch_size", "epochs", "learning_rate", "lr_scheduler_type", "lora_rank", "lora_alpha", "lora_dropout", "cutoff_len", "dataset", "eval_dataset", "format", "eval_steps", "raw_text_file", "higher_rank_limit", "warmup_steps", "optimizer", "hard_cut_string", "train_only_after", "stop_at_loss", "add_eos_token", "min_chars", "report_to", "precize_slicing_overlap", "add_eos_token_type", "save_steps_under_loss", "add_bos_token", "training_projection","sliding_window","warmup_ratio","grad_accumulation","neft_noise_alpha","group_by_length","eliminate_long_blocks","lora_target_linear", "stop_at_epoch","datasetJSONL", "eval_datasetJSONL", "eval_stepsJSONL","hybrid_training", "hybrid_data_ratio","hybrid_text_ratio"]
# NOTE(review): presumably flipped by the interrupt handler to stop training —
# the code that reads/writes it is not visible here.
WANT_INTERRUPT = False
# NOTE(review): the three containers below are filled elsewhere in the file;
# their exact contents cannot be determined from this section.
train_log = {}
train_template = {}
train_log_graph = []
# Options offered by the "LLaMA Target Projections" radio in ui(); the UI
# default is train_choices[4], i.e. "q-v".
train_choices = ["all","q-k-v-o","q-k-v","k-v-down","q-v"]
# Two parallel lists keyed by 'loss' and 'lr'.
# NOTE(review): presumably appended to as training progresses — confirm.
statistics = {
'loss': [],
'lr': [],
}
# ANSI escape sequences used to colour console messages; RESET restores the
# terminal's default colour.
RED = "\033[91m"
YELLOW = "\033[93m"
GREEN = "\033[92m"
RESET = "\033[0m"
def ui():
with gr.Tab('Train LoRA', elem_id='lora-train-tab'):
tmp = gr.State('')
with gr.Row():
with gr.Column():
# YY.MM.DD
gr.Markdown("`Ver: 24.07.02` This is enhanced version of QLora Training. [Maintained by FP](https://github.com/FartyPants/Training_PRO/tree/main)")
with gr.Row():
with gr.Column(scale=5):
with gr.Row():
copy_from = gr.Dropdown(label='Copy parameters from', value='None', choices=get_available_loras_local(non_serialized_params['Lora_sortedByTime']), elem_classes=['slim-dropdown'],allow_custom_value = True)
create_refresh_button(copy_from, lambda: None, lambda: {'choices': get_available_loras_local(non_serialized_params['Lora_sortedByTime'])}, 'refresh-button')
with gr.Column():
sort_byTime = gr.Checkbox(label='Sort list by Date', value=False, info='Sorts Loras by date created.', elem_classes=['no-background'])
with gr.Row():
with gr.Column(scale=5):
lora_name = gr.Textbox(label='Name', info='The name of your new LoRA file')
with gr.Column():
always_override = gr.Checkbox(label='Override Existing Files', value=False, info='If the name is the same, checking will replace the existing file, and unchecking will load and continue from it (the rank must be the same).', elem_classes=['no-background'])
with gr.Row():
with gr.Column():
lora_rank = gr.Slider(label='LoRA Rank', value=32, minimum=0, maximum=1024, step=4, info='Also called dimension count. Higher values = larger file, more content control. Smaller values = smaller file, less control. Use 4 or 8 for style, 128 or 256 to teach, 1024+ for fine-detail on big data. More VRAM is needed for higher ranks.')
lora_alpha = gr.Slider(label='LoRA Alpha', value=64, minimum=0, maximum=2048, step=4, info='This divided by the rank becomes the scaling of the LoRA. Higher means stronger. A good standard value is twice your Rank.')
batch_size = gr.Slider(visible= False, label='Batch Size', value=0, minimum=0, maximum=1024, step=4, info='Now Replaced with Gradient accumulation. Keeping it for sake of old saved data')
micro_batch_size = gr.Slider(label='True Batch Size', value=4, minimum=1, maximum=128, step=1, info='Specifies how many text blocks per step will be trained. The higher value, the better the concept of training will be, but it requires more GPU memory and it reduces speed.')
grad_accumulation = gr.Slider(label='Gradient Accumulation Steps', value=1, minimum=1, maximum=256, step=1, info="Virtually multiplies the Batch Size by averaging the learning over more than one step. VRAM friendly. Evens out loss fluctuations but can also degrade training fidelity.")
with gr.Column():
epochs = gr.Number(label='Epochs', value=3, info='Number of times every entry in the dataset should be fed into training. So 1 means feed each item in once, 5 means feed it in five times, etc.')
learning_rate = gr.Textbox(label='Learning Rate', value='3e-4', info='In scientific notation. 3e-4 is a good starting base point. 1e-2 is extremely high, 1e-6 is extremely low.')
lr_scheduler_type = gr.Dropdown(label='LR Scheduler', value='linear', choices=['linear', 'constant', 'constant_with_warmup', 'cosine', 'cosine_with_restarts', 'polynomial', 'inverse_sqrt', 'FP_low_epoch_annealing', 'FP_half_time_annealing','FP_raise_fall_creative'], info='Learning rate scheduler - defines how the learning rate changes over time. Custom schedulers: FP_low_epoch_annealing, FP_half_time_annealing, FP_raise_fall_creative (see README)', elem_classes=['slim-dropdown'])
with gr.Accordion(label='Checkpoints', open=True):
with gr.Row():
with gr.Column():
save_steps = gr.Number(label='Save every n steps', value=0, info='A checkpoint will be saved every n steps and at each Epoch boundary. (0 = OFF)')
with gr.Column():
save_steps_under_loss = gr.Slider(label='Save at 10% Loss change', value=1.8, minimum=0.0, maximum=3.0, step=0.1, info="Saves checkpoints at (or bellow) this loss and then each time loss falls by at least 10% This works independently from 'Save every n steps'")
with gr.Row():
save_chackpoint_now = gr.Button('Queue Checkpoint Now')
with gr.Accordion(label ='Stops (can be changed during training)',open = True):
with gr.Row():
with gr.Column():
stop_at_loss = gr.Slider(label='Stop at loss', minimum=0.0, maximum=3.0, step=0.1, value=0.00, info='If non 0 the process will automatically stop once the desired loss value is reached.')
with gr.Column():
stop_at_epoch = gr.Slider(label='Stop at Epoch', minimum=0, maximum=20, step=1, value=0, info='If non 0 the process will stop early once the set epoch is reached.')
with gr.Accordion(label='Advanced Options', open=True):
with gr.Row():
with gr.Column():
warmup_steps = gr.Number(label='Warmup Steps', value=100, info='Number of max steps used for a linear warmup. Reduces early over-fitting by the first training blocks. Value has precedent over Warmup Ratio. Aligns to the closest multiple of graddient accumulation')
warmup_ratio = gr.Slider(label='Warmup Ratio', minimum=0.0, maximum=0.2, step=0.025, value=0.0, info='Ratio of total training steps that will be used for a linear warmup. It applies only if Warmup Step is 0.')
neft_noise_alpha = gr.Slider(label='NEFtune noise scale', minimum=0.0, maximum=15, step=1, value=0.0, info='Add noise to the training to improve generalization. [0 - OFF, Starting value to experiment: 5]')
training_projection = gr.Radio(value = train_choices[4], label='LLaMA Target Projections', info='Change the targets (LORA is typically q-v)', choices=train_choices)
lora_target_linear = gr.Checkbox(label='All Linear Targets', value=False, info='Use all linear targets in the model')
lora_dropout = gr.Slider(label='LoRA Dropout', minimum=0.0, maximum=1.0, step=0.025, value=0.05, info='Percentage probability for dropout of LoRA layers. This can help reduce overfitting. Most users should leave at default.')
optimizer = gr.Dropdown(label='Optimizer', value='adamw_torch', choices=['adamw_hf', 'adamw_torch', 'adamw_torch_fused', 'adamw_torch_xla', 'adamw_apex_fused', 'adafactor', 'adamw_bnb_8bit', 'adamw_anyprecision', 'sgd', 'adagrad'], info='Different optimizer implementation options, for advanced users. Effects of different options are not well documented yet.', elem_classes=['slim-dropdown'])
with gr.Column():
train_only_after = gr.Textbox(label='Train Only After', value='', info='Only consider text *after* this string in any given chunk for training. For Alpaca datasets, use "### Response:" to only train the response and ignore the input.')
add_bos_token = gr.Checkbox(label='Add BOS token', value=True, info="Adds BOS token for each dataset item")
add_eos_token = gr.Checkbox(label='Add EOS token', value=False, info="Adds EOS token for each dataset item")
add_eos_token_type = gr.Dropdown(label='EOS placement (Text file)', choices=['Every Block', 'Hard Cut Blocks Only'], value='Every Block', info='', allow_custom_value = False)
group_by_length = gr.Checkbox(label='Group Samples by Length', value=False, info='Group together samples of roughly the same length in the training dataset.')
eliminate_long_blocks = gr.Checkbox(label='Eliminate cutoff blocks', value=False, info='Instead of just trimming blocks at cutoff, eliminate them from dataset alltogether if they are too long.')
higher_rank_limit = gr.Checkbox(label='Enable higher ranks', value=False, info='If checked, changes Rank/Alpha slider above to go much higher. This will not work without a datacenter-class GPU.')
report_to = gr.Radio(label="Save detailed logs with", value="None", choices=["None", "wandb", "tensorboard"], interactive=True)
# for future
#with gr.Accordion(label='Dynamic Scheduler', open = False):
# ds_min_epochs = gr.Number(label='Minimum Epochs', value='1', info='Minimum epochs that will be always performed before ramp down can be triggered')
# ds_max_epochs = gr.Number(label='Maximum Epochs (fallback)', value='50', info='Maximum Epochs before the training will bail out completely (should be a large number)')
# ds_loss_trigger = gr.Slider(label='Trigger Loss', minimum=0.0, maximum=2.8, step=0.1, value=1.6, info='Loss at which the ramp down schedule will be triggered')
# ds_loss_rolling_window = gr.Number(label='Loss rolling average', value='4', info='Calculate loss by averaging last x numbers to avoid jumps and noise')
# ds_epochs_to_ramp = gr.Slider(label='Ramp down ratio', minimum=0.0, maximum=2.0, step=0.1, value=1.00, info='How long the ramp down will last relative to ellapsed steps (before trigger)')
# gr.Markdown('These are settings for FP_dynamic_loss_trigger scheduler. The scheduler will do warm up, then hold constant untill a loss falls under Trigger Loss, then it will commence linear ramp down schedule and stop. The length of ramp down is set by Ramp down ratio where (ramp down steps) = ratio * (elapsed steps). (The time to completition shown will be very high untill ramp down is triggered.)')
with gr.Column():
with gr.Tab(label='JSON Dataset'):
with gr.Row():
with gr.Column():
with gr.Row():
dataset = gr.Dropdown(choices=get_datasets('training/datasets', 'json'), value='None', label='Dataset', info='The flexible dataset JSON file to use for training.', allow_custom_value=True, elem_classes=['slim-dropdown'])
create_refresh_button(dataset, lambda: None, lambda: {'choices': get_datasets('training/datasets', 'json')}, 'refresh-button')
with gr.Row():
eval_dataset = gr.Dropdown(choices=get_datasets('training/datasets', 'json'), value='None', label='Evaluation Dataset', info='The (optional) dataset file used to evaluate the model after training.', allow_custom_value=True, elem_classes=['slim-dropdown'])
create_refresh_button(eval_dataset, lambda: None, lambda: {'choices': get_datasets('training/datasets', 'json')}, 'refresh-button')
with gr.Column():
with gr.Row():
format = gr.Dropdown(choices=get_datasets('training/formats', 'json'), value='None', label='Data Format', info='The format file used to decide how to format the JSON dataset input.', elem_classes=['slim-dropdown'])
create_refresh_button(format, lambda: None, lambda: {'choices': get_datasets('training/formats', 'json')}, 'refresh-button')
with gr.Row():
eval_steps = gr.Number(label='Evaluate every n steps', value=100, info='If an evaluation dataset is given, test it every time this many steps pass.')
with gr.Tab(label='JSONL Dataset'):
with gr.Row():
with gr.Column():
with gr.Row():
datasetJSONL = gr.Dropdown(choices=get_datasets('training/datasets', 'jsonl'), value='None', label='JSONL Dataset', info='JSONL dataset file to use for training. See OpenAI documentation.', allow_custom_value=True, elem_classes=['slim-dropdown'])
create_refresh_button(datasetJSONL, lambda: None, lambda: {'choices': get_datasets('training/datasets', 'jsonl')}, 'refresh-button')
with gr.Row():
eval_datasetJSONL = gr.Dropdown(choices=get_datasets('training/datasets', 'jsonl'), value='None', label='JSONL Evaluation Dataset', info='The (optional) dataset file used to evaluate the model after training.', allow_custom_value=True, elem_classes=['slim-dropdown'])
create_refresh_button(eval_datasetJSONL, lambda: None, lambda: {'choices': get_datasets('training/datasets', 'jsonl')}, 'refresh-button')
with gr.Column():
with gr.Row():
gr.Markdown('The format will be chosen automatically from the chat template in tokenizer. If the tokenizer doesn\'t have chat template defined (legacy), select the correct template in the WebUI [Parameters - Instruction template]')
with gr.Row():
eval_stepsJSONL = gr.Number(label='Evaluate every n steps', value=100, info='If an evaluation JSONL dataset is given, test it every time this many steps pass.')
with gr.Tab(label="Text file"):
with gr.Row():
raw_text_file = gr.Dropdown(choices=get_datasets('training/datasets', 'txt'), value='None', label='Text file', info='The text file to use for training.', allow_custom_value=True, elem_classes=['slim-dropdown'])
create_refresh_button(raw_text_file, lambda: None, lambda: {'choices': get_datasets('training/datasets', 'txt')}, 'refresh-button')
with gr.Row():
with gr.Column():
precize_slicing_overlap = gr.Checkbox(label='Add Overlapping blocks', value = True)
sliding_window = gr.Checkbox(label='DEMENTOR Long-form Learning by FP (Highly Experimental, use low epochs)', value = False, info='Deep Memorization Enforcement Through Overlapping and Repetition. (I named it, so shush). Special process for learning long-form text using low amount of epochs.')
#debug_slicer = gr.Checkbox(label='Dump sentencelist.json to logs', value = non_serialized_params['debug_slicer'], info='Debug Slicer')
with gr.Column():
hard_cut_string = gr.Textbox(label='Hard Cut String', value='\\n\\n\\n', info='String that indicates a cut between logical blocks of text (ex. Ideas or Chapters). Helps prevent unwanted overlap between unrelated ideas.')
min_chars = gr.Number(label='Ignore small blocks', value=0, info='Ignore Text blocks that have less or equal characters than this number.')
with gr.Tab(label="Hybrid"):
hybrid_training = gr.Checkbox(label='Hybrid Training (Experimental)', value = False, info = 'Train using Raw text file AND JSON or JSONL dataset at the same time.')
with gr.Row():
hybrid_data_ratio = gr.Slider(value = 100, minimum=0, maximum=100,label='Percentage of Dataset used')
hybrid_text_ratio = gr.Slider(value = 100, minimum=0, maximum=100,label='Percentage of Text file used')
gr.Markdown('This is an experimental hybrid training using both instruct and non-instruct data at once. You need to select Raw Text file AND JSON or JSONL dataset.\n\nOptionally you can set a percentage of dataset / text to dial the correct model response.')
with gr.Tab(label="URL"):
with gr.Row():
with gr.Column():
download_file_url = gr.Textbox(label='Download JSON or txt file to datasets (or formats) folder', value='',info='The URL of a file to download. If on github, make sure you get url of the raw file (https://raw.githubusercontent.com/...). If huggin face, make sure the url has /resolve/ in it not /blob/')
with gr.Row():
download_check_overwrite = gr.Checkbox(label='Overwrite', value=False, info='Overwrite if file exist')
download_folder = gr.Radio(label="Destination", value='training/datasets', choices=['training/datasets', 'training/formats'], interactive=True)
download_button = gr.Button('Download')
download_status = gr.Textbox(label='Download Status', value='', interactive=False)
with gr.Tab(label="Tools"):
with gr.Row():
with gr.Column():
split_dataset_perc = gr.Number(label='Evaluation dataset split (percentage)', value=10, info='Splits JSON dataset into _train and _eval files by the split percentage. Make sure the JSON is selected in the Formatted Dataset tab first.')
split_dataset_do = gr.Button('Split dataset')
with gr.Column():
convert_system = gr.Textbox(label = 'Convert JSON to JSONL', info = 'Select JSON in JSON Dataset tab and add System Message:', value='You are a helpful AI assistant.', lines=2)
convert_do = gr.Button('Convert JSON to JSONL')
with gr.Row():
with gr.Column():
convert_system2 = gr.Textbox(label = 'Simple TXT to JSONL conversion', info = 'Select TXT in Text File tab. Each item in txt should be separated by at least 3 empty lines. Enter system message:', value='You are a helpful AI assistant.', lines=1)
convert_prompt2 = gr.Textbox(label = 'Prompt', info = 'Prompt that will be inserted for every item', value='Write me a limerick.', lines=1)
convert_do2 = gr.Button('Convert TXT to JSONL')
with gr.Column():
dump_dataset = gr.Checkbox(label='Dump Training Dataset', value=False, info='Just before training begins, decode and dump the entire dataset into JSON file in /logs/')
dump_dataset_remove_s = gr.Checkbox(label='Clean up dump dataset', value=True, info='Removes BOS and EOS form the dump dataset')
with gr.Row():
with gr.Column():
with gr.Row():
cutoff_len = gr.Slider(label='Maximum context length (Cutoff)', minimum=32, maximum=4096, value=256, step=32, info='The maximum length of a chunk (in tokens). Applies to both JSON dataset and text files. Higher values require much more VRAM.')
with gr.Row():
with gr.Column():
check_dataset_btn = gr.Button('Verify Dataset/Text File and suggest data entries')
check_dataset_txt = gr.Textbox(label='Dataset info', value='')
with gr.Row():
start_button = gr.Button("Start LoRA Training", variant='primary')
stop_button = gr.Button("Interrupt")
with gr.Accordion(label="Graph", open=True):
with gr.Row():
# show_actions_button = False - we use old gradio
plot_graph = gr.LinePlot(x="epoch", y="value", title="Loss Metrics", overlay_point=True, tooltip=["epoch", "value"], x_lim=[0, 1], y_lim=[0, 3.5], width=500, height=250)
output = gr.Markdown(value="Ready")
with gr.Tab('Perplexity evaluation', elem_id='evaluate-tab'):
with gr.Row():
with gr.Column():
models = gr.Dropdown(utils.get_available_models(), label='Models', multiselect=True)
evaluate_text_file = gr.Dropdown(choices=['wikitext', 'ptb', 'ptb_new'] + get_datasets('training/datasets', 'txt')[1:], value='wikitext', label='Input dataset', info='The text file on which the model will be evaluated. The first options are automatically downloaded: wikitext, ptb, and ptb_new. The next options are your local text files under training/datasets.')
with gr.Row():
with gr.Column():
stride_length = gr.Slider(label='Stride', minimum=1, maximum=2048, value=512, step=1, info='Used to make the evaluation faster at the cost of accuracy. 1 = slowest but most accurate. 512 is a common value.')
with gr.Column():
max_length = gr.Number(label='max_length', precision=0, step=256, value=0, info='The context for each evaluation. If set to 0, the maximum context length for the model will be used.')
with gr.Row():
start_current_evaluation = gr.Button("Evaluate loaded model")
start_evaluation = gr.Button("Evaluate selected models")
stop_evaluation = gr.Button("Interrupt")
with gr.Column():
evaluation_log = gr.Markdown(value='')
evaluation_table = gr.Dataframe(value=generate_markdown_table(), interactive=True)
with gr.Row():
save_comments = gr.Button('Save comments', elem_classes="small-button")
refresh_table = gr.Button('Refresh the table', elem_classes="small-button")
# Training events
all_params = [lora_name, always_override, save_steps, micro_batch_size, batch_size, epochs, learning_rate, lr_scheduler_type, lora_rank, lora_alpha, lora_dropout, cutoff_len, dataset, eval_dataset, format, eval_steps, raw_text_file, higher_rank_limit, warmup_steps, optimizer, hard_cut_string, train_only_after, stop_at_loss, add_eos_token, min_chars, report_to, precize_slicing_overlap, add_eos_token_type, save_steps_under_loss, add_bos_token, training_projection,sliding_window,warmup_ratio,grad_accumulation, neft_noise_alpha,group_by_length,eliminate_long_blocks,lora_target_linear, stop_at_epoch, datasetJSONL, eval_datasetJSONL, eval_stepsJSONL, hybrid_training, hybrid_data_ratio, hybrid_text_ratio]
def fix_old_version(batch_size_val, micro_batch_size_val, grad_accumulation_val):
    """Translate a legacy 'Batch Size' setting into gradient accumulation steps.

    Older saved parameter sets carry a non-zero ``batch_size``; newer ones use
    ``grad_accumulation`` directly and leave ``batch_size`` at 0.

    Args:
        batch_size_val: Legacy batch size; 0 (or None) means "not used".
        micro_batch_size_val: The true per-step batch size.
        grad_accumulation_val: The currently configured accumulation steps.

    Returns:
        ``batch_size_val // micro_batch_size_val`` clamped to a minimum of 1
        when a usable legacy batch size is present, otherwise
        ``grad_accumulation_val`` unchanged.
    """
    # No legacy value present: keep whatever the new-style setting says.
    if not batch_size_val or batch_size_val <= 0:
        return grad_accumulation_val

    # A missing or non-positive micro batch size cannot be divided by; fall
    # back to the current setting instead of raising ZeroDivisionError.
    if not micro_batch_size_val or micro_batch_size_val <= 0:
        return grad_accumulation_val

    # Clamp to 1: a legacy batch size smaller than the micro batch size would
    # otherwise floor-divide to 0, which is not a valid accumulation count.
    gradient_acc = max(1, batch_size_val // micro_batch_size_val)
    print(f"Using Old version of Batch Size ({batch_size_val}) to set Gradient Accumulation: {gradient_acc}")
    return gradient_acc
copy_from.change(partial(do_copy_params, all_params= all_params), copy_from, all_params).then(fix_old_version,[batch_size,micro_batch_size, grad_accumulation],grad_accumulation)
start_button.click(do_train, all_params, [output,plot_graph])
stop_button.click(do_interrupt, None, None, queue=False)
higher_rank_limit.change(change_rank_limit, [higher_rank_limit], [lora_rank, lora_alpha])
def trigger_stop_at_loss(stop_at_loss_value):
    """Record a new 'Stop at loss' value in the shared runtime state.

    If the ``training_loop`` flag is set, a console note about the queued
    change is printed as well.
    """
    non_serialized_params["stop_at_loss"] = stop_at_loss_value

    training_active = non_serialized_params['training_loop']
    if not training_active:
        return
    print(f"Queue: [Stop at loss Change] to {stop_at_loss_value}")
def trigger_stop_at_epoch(stop_at_epoch_value):
    """Record a new 'Stop at Epoch' value in the shared runtime state.

    If the ``training_loop`` flag is set, a console note about the queued
    change is printed as well.
    """
    non_serialized_params["stop_at_epoch"] = stop_at_epoch_value

    training_active = non_serialized_params['training_loop']
    if not training_active:
        return
    print(f"Queue: [Stop at Epoch Change] to {stop_at_epoch_value}")
stop_at_loss.change(trigger_stop_at_loss, stop_at_loss, None)
stop_at_epoch.change(trigger_stop_at_epoch, stop_at_epoch, None)
def trigger_save_checkpoint():
    """Raise the ``save_checkpoint_now`` flag and tell the user what to expect.

    The flag is set unconditionally; the printed message depends on whether
    the ``training_loop`` flag is currently set.
    """
    non_serialized_params["save_checkpoint_now"] = True

    message = (
        "Queue: [Save checkpoint] Checkpoint will be saved after the current step is finished."
        if non_serialized_params['training_loop']
        else "Use during the training to save the checkpoint at any time."
    )
    print(message)
def update_button():
    """Return a Button update that shows the checkpoint request as queued."""
    queued_state = gr.Button.update(
        '[Checkpoint in Queue]',
        variant='stop',
        interactive=True,
    )
    return queued_state
def update_button2():
    """Wait one second, then return a Button update restoring the idle label."""
    idle_look = dict(variant='secondary', interactive=True)
    # Pause before handing back the update that reverts the button.
    time.sleep(1.0)
    return gr.Button.update('Queue Checkpoint Now', **idle_look)
save_chackpoint_now.click(trigger_save_checkpoint, None, None).then(update_button, None,save_chackpoint_now).then(update_button2, None,save_chackpoint_now)
dataset_calc_params = [save_steps,micro_batch_size, epochs, cutoff_len, dataset, format, raw_text_file, warmup_steps, hard_cut_string, min_chars, precize_slicing_overlap,sliding_window,warmup_ratio,grad_accumulation, datasetJSONL, hybrid_training, hybrid_data_ratio, hybrid_text_ratio]
def check_dataset(save_steps:int, micro_batch_size: int, epochs: int, cutoff_len: int, dataset:str, format:str, raw_text_file:str, warmup_steps:int, hard_cut_string:str, min_chars:int, precize_slicing_overlap:bool,sliding_window:bool,warmup_ratio:float,grad_accumulation:int, datasetJSONL:str, hybrid_training:bool, hybrid_data_ratio:int, hybrid_text_ratio:int):
result = "Specify JSON dastaset or Text file"
total_blocks = 0
if shared.tokenizer is None:
yield "Tokenizer is not available. Please Load some Model first."
return
# hybrid training hybrid_training
raw_text_used = False
hybrid_text_train_data = None
max_length_tokens = 0
hybrid_total_text_blocks = 0
if hybrid_training == True:
print(f" === {RED}Hybrid Training{RESET} ===")
if raw_text_file not in ['None', '']:
if datasetJSONL not in ['None', '']:
print(f" - Raw text + JSONL")
elif dataset not in ['None', '']:
print(f" - Raw text + JSON")
else:
print(f" - {RED}Error:{RESET} for Hybrid training you need Raw text AND JSONL or JSON dataset")
yield "Missing dataset and raw file for hybrid training, cannot continue."
return
else:
print(f" - {RED}Error:{RESET} for Hybrid training you need JSONL or JSON dataset AND Raw text file.")
yield "Missing dataset and raw file for hybrid training, cannot continue."
return
if raw_text_file not in ['None', '']:
logger.info("Loading Text file...")
fullpath = clean_path('training/datasets', f'{raw_text_file}')
fullpath = Path(fullpath)
if fullpath.is_dir():
logger.info('Training path directory {}'.format(raw_text_file))
raw_text = ""
file_paths = sorted(fullpath.glob('*.txt'), key=lambda path: natural_keys(path.name))
for file_path in file_paths:
if file_path.is_file():
with file_path.open('r', encoding='utf-8') as file:
raw_text += file.read().replace('\r', '')
logger.info(f"Loaded training file: {file_path.name}")
else:
try:
with open(clean_path('training/datasets', f'{raw_text_file}.txt'), 'r', encoding='utf-8') as file:
raw_text = file.read().replace('\r', '')
except:
yield f"{raw_text_file}.txt doesn't seem to exsist anymore... check your training/datasets folder"
return
if min_chars<0:
min_chars = 0
EOS_token_str = '</s>'
BOS_token_str = '<s>'
if hasattr(shared.tokenizer, 'bos_token'):
BOS_token_str = shared.tokenizer.bos_token
else:
print(f" - No {RED}BOS{RESET} token defined in tokenizer, using default")
if hasattr(shared.tokenizer, 'eos_token'):
EOS_token_str = shared.tokenizer.eos_token
else:
print(f" - No {RED}EOS{RESET} token defined in tokenizer, using default")
print(f"Tokenizer BOS token: {GREEN}{BOS_token_str}{RESET}, EOS token: {RED}{EOS_token_str}{RESET}")
# == New more precise slicing on sentence boundary ==
if sliding_window:
text_chunks = sliding_block_cut(raw_text, min_chars, False, cutoff_len, hard_cut_string,non_serialized_params['debug_slicer'],EOS_token_str,BOS_token_str)
else:
text_chunks = precise_cut(raw_text, precize_slicing_overlap, min_chars, False, cutoff_len, hard_cut_string,non_serialized_params['debug_slicer'],EOS_token_str,BOS_token_str)
total_blocks = len(text_chunks)
hybrid_total_text_blocks = total_blocks
if hybrid_training==False:
raw_text_used = True
max_length = 0
max_text = ''
for example in text_chunks:
if len(example) > max_length:
max_length = len(example)
max_text = example
input_ids = shared.tokenizer.encode(max_text, truncation=True, max_length=8192)
result = f"Text: ({raw_text_file}.txt) has {total_blocks} blocks (Block Size {cutoff_len} tokens)"
result += f"\nLongest Plain Text Block: {len(input_ids)+1}"
if hybrid_training == True:
num_text_to_keep = int(total_blocks * float(hybrid_text_ratio) / 100.0)
result += f"\nUsing {hybrid_text_ratio}% of text: ({num_text_to_keep}/{total_blocks}) blocks"
hybrid_total_text_blocks = num_text_to_keep
#no suggestion for plaintext as it is set by cutoff_len anyway
max_length_tokens = 0
del text_chunks
# datasets
if raw_text_used == False:
data = None
format_data: dict[str, str] = {}
format_text = ''
if datasetJSONL not in ['None', '']:
logger.info("Loading JSONL datasets...")
with open(clean_path('training/datasets', f'{datasetJSONL}.jsonl'), 'r', encoding='utf-8-sig') as dataFile:
loaded_JSONLdata = json.load(dataFile)
chat_template = shared.tokenizer.chat_template
format_text = "Template: [Embedded]"
if shared.tokenizer.chat_template is None or shared.tokenizer.chat_template =='':
print(f"{RED}Missing chat template in tokenizer. Using instruction_template instead{RESET}")
shared.tokenizer.chat_template = shared.persistent_interface_state['instruction_template_str']
format_text = "Template: [Missing] << using instruction template instead"
logger.info("Applying chat template")
data_list = [{"jsonl": shared.tokenizer.apply_chat_template(entry["messages"], tokenize=False, add_generation_prompt=False)} for entry in loaded_JSONLdata]
shared.tokenizer.chat_template = chat_template
data = DatasetDict()
data['train'] = Dataset.from_list(data_list)
format_data = {"jsonl": "%jsonl%"}
else:
if dataset in ['None', '']:
yield "Select dataset or text file."
return
if format in ['None', '']:
yield "Select format choice for dataset."
return
if shared.tokenizer.pad_token_id is None:
print("Missing pad ID - setting to 0")
shared.tokenizer.pad_token_id = 0
with open(clean_path('training/formats', f'{format}.json'), 'r', encoding='utf-8-sig') as formatFile:
format_data: dict[str, str] = json.load(formatFile)
format_text = f'Format: [JSON] {format}'
logger.info("Loading JSON datasets...")
data = load_dataset("json", data_files=clean_path('training/datasets', f'{dataset}.json'))
def generate_prompt(data_point: dict[str, str]):
    """Render a dataset row through the first format entry whose key set matches it.

    Each key of ``format_data`` is a comma-separated list of field names. An
    entry matches when that list, taken as a set, equals the set of fields in
    ``data_point`` that hold a non-blank string. The matching template then
    has every ``%field%`` placeholder replaced by the row's string value.

    Raises:
        RuntimeError: if no entry of ``format_data`` matches the row.
    """
    for option_keys, template in format_data.items():
        wanted = set(option_keys.split(','))
        present = {name for name, value in data_point.items() if type(value) is str and len(value.strip()) > 0}
        if wanted != present:
            continue
        prompt = template
        for field, value in data_point.items():
            # Non-string values are never substituted into the template.
            if type(value) is str:
                prompt = prompt.replace(f'%{field}%', value)
        return prompt
    raise RuntimeError(f'Data-point "{data_point}" has no keyset match within format "{list(format_data.keys())}"')
def tokenize_dummy(prompt):
    """Tokenize ``prompt`` (truncated to 8192 tokens) for length statistics.

    Returns a dict with the token ids as a tensor, a placeholder ``labels``
    list of ones with the same length, and an attention mask that is False
    wherever a token equals the tokenizer's pad token id.
    """
    token_list = shared.tokenizer.encode(prompt, truncation=True, max_length=8192)
    placeholder_labels = [1] * len(token_list)
    ids_tensor = torch.tensor(token_list)
    return {
        "input_ids": ids_tensor,
        "labels": placeholder_labels,
        "attention_mask": ids_tensor.ne(shared.tokenizer.pad_token_id),
    }
def generate_and_tokenize_prompt(data_point):
    """Format one dataset row into a prompt and tokenize it with ``tokenize_dummy``."""
    return tokenize_dummy(generate_prompt(data_point))
data_keys = []
if data:
if 'train' in data: # Check if the 'train' split exists in the dataset
data_keys = list(data['train'][0].keys())
print("Data Keys:", data_keys)
else:
print("The dataset is empty.")
if shared.tokenizer.pad_token_id is None:
print("Missing pad ID - setting to 0")
shared.tokenizer.pad_token_id = 0
train_data = data['train'].map(generate_and_tokenize_prompt, new_fingerprint='%030x' % random.randrange(16**30))
total_blocks = train_data.num_rows
max_length = 0
second_max_length = 0
for example in train_data:
length = len(example['input_ids'])
if length > max_length:
second_max_length = max_length
max_length = length
elif length > second_max_length:
second_max_length = length
max_length_tokens = max_length
if hybrid_training:
result = result+'\n'
else:
result = ''
result += f"Dataset: ({dataset}.json) has {total_blocks} blocks @ length = {cutoff_len} tokens\nKeys: {data_keys} {format_text}"
result += f"\nLongest Data Block: {max_length_tokens} tokens. Second Longest Block: {second_max_length} tokens."
if hybrid_training == True:
num_data_to_keep = int(total_blocks * float(hybrid_data_ratio) / 100.0)
result += f"\nUsing {hybrid_data_ratio}% of dataset: ({num_data_to_keep}/{total_blocks}) blocks"
total_blocks = num_data_to_keep
#for options, data in format_data.items():
# format_keys = options.split(',')
# result += f"{format_keys}, "
#result = result.rstrip()
#result = result.rstrip(',')
if total_blocks>0:
if hybrid_training == True:
total_blocks = hybrid_total_text_blocks + total_blocks
result += f"\n[Total number of Hybrid blocks: {total_blocks}]"
number_ofSteps = int(math.ceil(total_blocks / micro_batch_size) * epochs)
num_stepsPer_epoch = int(math.ceil(number_ofSteps/epochs))
min_warm = math.ceil(100 / grad_accumulation)
warmup_steps_suggest = min(int(min_warm*grad_accumulation), int(math.ceil(number_ofSteps * 0.1)))
warmup_steps_suggest = min(warmup_steps_suggest,num_stepsPer_epoch)
save_each_n_min = int(math.ceil(number_ofSteps/10))
save_each_n_max = int(math.ceil(number_ofSteps/5))
gradient_accumulation_max = int(total_blocks)//micro_batch_size
result += f"\n[Batch Size: {micro_batch_size}, Epochs: {epochs}, Gradient Accumulation: {grad_accumulation}]\n"
result += f"Total number of steps: {number_ofSteps}\n"
result += f"Steps per each Epoch: {num_stepsPer_epoch}\n"
result += f"Suggestions:\n"
if max_length_tokens>0:
next_max_multiple = ((max_length_tokens + 31) // 32) * 32
result += f"Maximum context length: {next_max_multiple} (Current: {cutoff_len})\n"
result += f"Checkpoints: Save every {save_each_n_min} - {save_each_n_max} steps (Current: {int(save_steps)})\n"
result += f"Warmup steps: {warmup_steps_suggest} (Current: {int(warmup_steps)})"
if gradient_accumulation_max < grad_accumulation:
result += f"\n\nWARNING: Gradient Accumulation {grad_accumulation} is too high: It should be below {gradient_accumulation_max}"
result = result.strip()
yield result
return
# Run check_dataset on click and show the text it yields in check_dataset_txt.
check_dataset_btn.click(check_dataset, dataset_calc_params ,check_dataset_txt)
# Evaluation events. For some reason, the interrupt event
# doesn't work with the .then() syntax, so I write them one
# by one in this ugly but functional way.
# The handles `ev` and `ev_cur` are kept so stop_evaluation can cancel them below.
ev = start_evaluation.click(calculate_perplexity, [models, evaluate_text_file, stride_length, max_length], evaluation_log, show_progress=False)
start_evaluation.click(generate_markdown_table, None, evaluation_table, show_progress=False)
# "Evaluate current model": first put the literal model list ['current model'] into tmp,
# then run the same perplexity calculation with tmp as the model selection.
start_current_evaluation.click(lambda: ['current model'], None, tmp)
ev_cur = start_current_evaluation.click(calculate_perplexity, [tmp, evaluate_text_file, stride_length, max_length], evaluation_log, show_progress=False)
start_current_evaluation.click(generate_markdown_table, None, evaluation_table, show_progress=False)
# Cancels either running evaluation event; no handler function is attached.
stop_evaluation.click(None, None, None, cancels=[ev, ev_cur], queue=False)
refresh_table.click(generate_markdown_table, None, evaluation_table, show_progress=True)
# Persist the table contents, then report success in the evaluation log.
save_comments.click(
save_past_evaluations, evaluation_table, None).then(
lambda: "Comments saved.", None, evaluation_log, show_progress=False)
def reload_lora():
    """Return a dropdown update listing the available LoRAs.

    The ``Lora_sortedByTime`` entry of ``non_serialized_params`` is passed
    through to ``get_available_loras_local``.
    """
    sort_by_time = non_serialized_params['Lora_sortedByTime']
    lora_choices = get_available_loras_local(sort_by_time)
    return gr.Dropdown.update(choices=lora_choices)
# nonserialized items
# Store the sort toggle in non_serialized_params, then refresh the copy_from dropdown choices.
sort_byTime.change(lambda x: non_serialized_params.update({"Lora_sortedByTime": x}), sort_byTime, None).then(reload_lora,None,copy_from)
#debug_slicer.change(lambda x: non_serialized_params.update({"debug_slicer": x}), debug_slicer, None)
def update_dataset():
    """Rescan ``training/datasets`` and return choice updates for the json and txt selectors."""
    datasets_dir = 'training/datasets'
    json_update = gr.update(choices=get_datasets(datasets_dir, 'json'))
    txt_update = gr.update(choices=get_datasets(datasets_dir, 'txt'))
    return json_update, txt_update
# Download the file, report into download_status, then refresh the dataset and raw_text_file choices.
download_button.click(download_file_from_url, [download_file_url,download_check_overwrite,download_folder] , download_status).then(update_dataset,None,[dataset , raw_text_file])
def update_datasetJSON():
    """Return two identical choice updates listing the ``.json`` datasets.

    The folder is scanned once per returned update.
    NOTE: a function with the same name and body is defined again further
    down in this file; the later definition is the one bound afterwards.
    """
    return tuple(gr.update(choices=get_datasets('training/datasets', 'json')) for _ in range(2))
def split_dataset(dataset, split_dataset_perc):
    """Shuffle a JSON dataset and write it out as separate train and eval files.

    Reads ``training/datasets/{dataset}.json``, shuffles its entries in place
    and writes ``{dataset}_train.json`` and ``{dataset}_eval.json`` next to it.

    Args:
        dataset: Dataset name without extension; ``'None'`` or ``''`` is a no-op.
        split_dataset_perc: Percentage of the entries that goes to the eval file.
    """
    if dataset in ('None', ''):
        print("No dataset selected in Formatted Datasets")
        return

    eval_perc = float(split_dataset_perc)
    # Report the ratio that was actually requested instead of a fixed "90/10".
    logger.info(f"Splitting JSON datasets {100.0 - eval_perc:g}/{eval_perc:g}...")

    dataset_json = f'{dataset}.json'
    dataset_json_new = f'{dataset}_train.json'
    eval_json_new = f'{dataset}_eval.json'

    # Load the original JSON data
    with open(clean_path('training/datasets', dataset_json), 'r', encoding='utf-8-sig') as f:
        data = json.load(f)

    # Fraction of the data kept for training; the remainder is used for evaluation
    split_ratio = 1.0 - eval_perc / 100.0
    total_samples = len(data)
    # Clamp so a percentage outside 0..100 cannot produce a negative or oversized slice index
    split_index = max(0, min(total_samples, int(total_samples * split_ratio)))

    print(f" + training: {split_index} blocks")
    print(f" + eval: {total_samples - split_index} blocks")

    # Shuffle the data to ensure randomness
    random.shuffle(data)

    # Split the data into training and evaluation sets
    train_data = data[:split_index]
    eval_data = data[split_index:]

    # Save the training data to a new JSON file
    with open(clean_path('training/datasets', dataset_json_new), 'w', encoding='utf-8') as f:
        json.dump(train_data, f, indent=2)

    # Save the evaluation data to a new JSON file
    with open(clean_path('training/datasets', eval_json_new), 'w', encoding='utf-8') as f:
        json.dump(eval_data, f, indent=2)
def select_dataset(dataset):
    """Pick the ``_train`` / ``_eval`` split files of ``dataset`` if they exist.

    Returns:
        A ``(train_name, eval_name)`` tuple of names without the ``.json``
        suffix; each element is ``'None'`` when the corresponding file is
        not present in ``training/datasets``.
    """
    candidates = (
        (f'{dataset}_train.json', 'training'),
        (f'{dataset}_eval.json', 'evaluation'),
    )
    located = [(name, purpose, clean_path('training/datasets', name)) for name, purpose in candidates]

    selections = []
    for file_name, purpose, full_path in located:
        choice = 'None'
        if Path(full_path).is_file():
            print(f"{file_name} file selected for {purpose}")
            choice = file_name.replace('.json', '')
        selections.append(choice)
    return selections[0], selections[1]
# Split the selected dataset, refresh the json choices of both selectors, then select the new _train / _eval files.
split_dataset_do.click(split_dataset,[dataset,split_dataset_perc],None).then(update_datasetJSON, None,[dataset, eval_dataset]).then(select_dataset, dataset,[dataset,eval_dataset])
def update_datasetJSONL():
    """Return two identical choice updates listing the ``.jsonl`` datasets.

    The folder is scanned once per returned update.
    """
    first_update = gr.update(choices=get_datasets('training/datasets', 'jsonl'))
    second_update = gr.update(choices=get_datasets('training/datasets', 'jsonl'))
    return first_update, second_update
def update_datasetJSON():
    """Return two identical choice updates listing the ``.json`` datasets.

    The folder is scanned once per returned update.
    NOTE: this repeats an identical definition made earlier in this file.
    """
    first_update = gr.update(choices=get_datasets('training/datasets', 'json'))
    second_update = gr.update(choices=get_datasets('training/datasets', 'json'))
    return first_update, second_update
def convert_json_to_jsonl(dataset, system_text):
    """Convert an instruction/output JSON dataset into chat-style ``messages`` entries.

    Reads ``training/datasets/{dataset}.json`` and writes
    ``training/datasets/{dataset}.jsonl``. The output is written with
    ``json.dump`` as one indented JSON array, not one object per line.

    Args:
        dataset: Dataset name without extension; ``'None'`` or ``''`` is a no-op.
        system_text: Content of a leading system message; when it is the
            empty string no system message is emitted.

    Raises:
        KeyError: if an entry lacks the ``instruction`` or ``output`` key.
    """
    if dataset in ('None', ''):
        print("No dataset selected in Formatted Datasets")
        return

    dataset_json = f'{dataset}.json'
    dataset_json_new = f'{dataset}.jsonl'

    with open(clean_path('training/datasets', dataset_json), 'r', encoding='utf-8-sig') as f:
        data = json.load(f)

    print(f"Converting {dataset_json}...")

    # Only emit a system turn when the caller supplied one.
    system_turn = [] if system_text == '' else [{"role": "system", "content": system_text}]
    converted_data = [
        {
            "messages": system_turn + [
                {"role": "user", "content": entry["instruction"]},
                {"role": "assistant", "content": entry["output"]},
            ]
        }
        for entry in data
    ]

    print(f"Saving {dataset_json_new}")
    # Explicit encoding, matching the other dataset writers in this file,
    # so the result does not depend on the platform's default locale.
    with open(clean_path('training/datasets', dataset_json_new), 'w', encoding='utf-8') as outfile:
        json.dump(converted_data, outfile, indent=2)
def convert_text_to_jsonl(textfile, system_text, prompt):
    """Convert a plain-text file into chat-style ``messages`` entries.

    Reads ``training/datasets/{textfile}.txt``, splits it on three
    consecutive newlines, and turns every non-blank piece into a
    system/user/assistant exchange whose assistant content is that piece.
    The result is written to ``training/datasets/{textfile}.jsonl`` with
    ``json.dump`` as one indented JSON array.

    Args:
        textfile: Text file name without extension; ``'None'`` or ``''`` is a no-op.
        system_text: Content of the system message (emitted even when empty).
        prompt: Content of the user message used for every entry.
    """
    if textfile in ('None', ''):
        print("No plain text selected in tab Text file")
        return

    dataset_txt = f'{textfile}.txt'
    dataset_json_new = f'{textfile}.jsonl'

    with open(clean_path('training/datasets', dataset_txt), 'r', encoding='utf-8-sig') as f:
        text = f.read().replace('\r', '')

    text_list = text.split("\n\n\n")
    print(f"Converting {dataset_txt}...")

    stripped_entries = (piece.strip() for piece in text_list)
    converted_data = [
        {
            "messages": [
                {"role": "system", "content": system_text},
                {"role": "user", "content": prompt},
                {"role": "assistant", "content": entry},
            ]
        }
        for entry in stripped_entries
        if entry != ''
    ]

    print(f"Saving {dataset_json_new}")
    # Explicit encoding, matching the other dataset writers in this file,
    # so the result does not depend on the platform's default locale.
    with open(clean_path('training/datasets', dataset_json_new), 'w', encoding='utf-8') as outfile:
        json.dump(converted_data, outfile, indent=2)
def select_datasetJSONL(dataset):
    """Return ``('None', name)`` selecting ``{dataset}.jsonl`` when that file exists.

    The first element is always ``'None'``; the second is the dataset name
    without the ``.jsonl`` suffix, or ``'None'`` if the file is missing
    from ``training/datasets``.
    """
    jsonl_name = f'{dataset}.jsonl'
    if not Path(clean_path('training/datasets', jsonl_name)).is_file():
        return 'None', 'None'
    print(f"{jsonl_name} file selected for training")
    return 'None', jsonl_name.replace('.jsonl', '')
def select_datasetJSON(dataset):
    """Return ``('None', name)`` selecting ``{dataset}.json`` when that file exists.

    The first element is always ``'None'``; the second is the dataset name
    without the ``.json`` suffix, or ``'None'`` if the file is missing
    from ``training/datasets``.
    """
    json_name = f'{dataset}.json'
    if not Path(clean_path('training/datasets', json_name)).is_file():
        return 'None', 'None'
    print(f"{json_name} file selected for training")
    return 'None', json_name.replace('.json', '')
# Convert the selected JSON dataset, refresh the jsonl choices, then set `dataset` to 'None'
# and select the new file in datasetJSONL (select_datasetJSONL returns ('None', name)).
convert_do.click(convert_json_to_jsonl,[dataset,convert_system],None).then(update_datasetJSONL, None,[datasetJSONL,eval_datasetJSONL]).then(select_datasetJSONL, dataset,[dataset,datasetJSONL])
# Same chain for plain text files: raw_text_file is set to 'None' and datasetJSONL gets the new file.
convert_do2.click(convert_text_to_jsonl,[raw_text_file,convert_system2,convert_prompt2],None).then(update_datasetJSONL, None,[datasetJSONL,eval_datasetJSONL]).then(select_datasetJSONL, raw_text_file,[raw_text_file,datasetJSONL])
# Mirror the two dump options into non_serialized_params whenever they change.
dump_dataset.change(lambda x: non_serialized_params.update({"dump_dataset": x}), dump_dataset, None)
dump_dataset_remove_s.change(lambda x: non_serialized_params.update({"dump_dataset_remove_s": x}), dump_dataset_remove_s, None)
def get_datasets(path: str, ext: str):
    """List dataset names found in ``path`` for a dropdown.

    Args:
        path: Directory to scan (non-recursively).
        ext: File extension without the dot. For ``"txt"`` the immediate
            subdirectories are listed as well.

    Returns:
        ``['None']`` followed by the unique stems of the matches, ordered
        with ``natural_keys``. The placeholder ``put-trainer-datasets-here``
        is excluded.
    """
    folder = Path(path)
    candidates = list(folder.glob(f'*.{ext}'))
    if ext == "txt":
        # include subdirectories for raw txt files to allow training from a subdirectory of txt files.
        # Filter with is_dir() explicitly: a trailing-slash glob pattern only restricts
        # matches to directories on Python 3.11+, older versions also return plain files.
        candidates += [entry for entry in folder.glob('*') if entry.is_dir()]

    names = {entry.stem for entry in candidates if entry.stem != 'put-trainer-datasets-here'}
    return ['None'] + sorted(names, key=natural_keys)
def do_interrupt():
    """Raise the module-level ``WANT_INTERRUPT`` flag."""
    global WANT_INTERRUPT

    WANT_INTERRUPT = True
def reload_model_local():
    """Unload the current model and load it again under the same name.

    Failures are logged and the traceback is printed; nothing is re-raised.
    """
    try:
        # Remember the name across unload_model() and restore it afterwards
        # (presumably unload_model() resets shared.model_name; verify).
        modelname = shared.model_name
        unload_model()
        shared.model_name = modelname
        if shared.model_name != '':
            shared.model, shared.tokenizer = load_model(shared.model_name, shared.args.loader)

        if shared.model is not None:
            print(f"Successfully reloaded `{shared.model_name}`.")
        else:
            print(f"Failed to reload `{shared.model_name}`.")
    except Exception:
        # `except Exception` rather than a bare `except`, so KeyboardInterrupt
        # and SystemExit still propagate instead of being reported as a load failure.
        exc = traceback.format_exc()
        logger.error('Failed to load the model.')
        print(exc)
def do_copy_params(lora_name: str, all_params):
    """Build the parameter values to display after picking a LoRA to copy from.

    Looks for ``training_parameters.json`` in the named LoRA's folder under
    ``shared.args.lora_dir``. For every name in ``PARAMETERS`` the stored
    value is used when the file has it; otherwise the value at the same
    position in ``all_params`` is kept.

    Returns:
        A list with one value per entry of ``PARAMETERS``, in that order.
    """
    params: dict[str, str] = {}
    if lora_name:
        f_name = f"{shared.args.lora_dir}/{clean_path(None, lora_name)}/training_parameters.json"
        if Path(f_name).is_file():
            with open(f_name, 'r', encoding='utf-8') as format_file:
                params = json.load(format_file)

    return [
        params[key] if key in params else all_params[position]
        for position, key in enumerate(PARAMETERS)
    ]
def change_rank_limit(use_higher_ranks: bool):
    """Return two update dicts carrying new ``maximum`` values.

    The maxima are 1024 and 2048, doubled when ``use_higher_ranks`` is truthy.
    """
    scale = 1
    if use_higher_ranks:
        scale = 2
    return tuple({"maximum": base * scale, "__type__": "update"} for base in (1024, 2048))
def clean_path(base_path: str, path: str):
    """Sanitize ``path`` and, if a base is given, anchor it under that directory.

    Backslashes become forward slashes and every ``..`` is replaced with
    ``_``. With ``base_path`` set to ``None`` only the sanitized path is
    returned; otherwise it is appended to the absolute form of ``base_path``.
    """
    sanitized = path.replace('\\', '/').replace('..', '_')
    if base_path is not None:
        return f'{Path(base_path).absolute()}/{sanitized}'
    return sanitized
def backup_adapter(input_folder):
    """Copy the files of an existing LoRA folder into a dated backup subfolder.

    Nothing happens unless ``adapter_model.bin`` exists in ``input_folder``.
    The subfolder is named ``Backup-YYYY-MM-DD`` from that file's
    ``st_ctime``. If the subfolder already holds an ``adapter_model.bin``
    the copy is skipped. Any exception is caught and printed.
    """
    try:
        adapter_file = Path(f"{input_folder}/adapter_model.bin")
        if not adapter_file.is_file():
            return

        logger.info("Backing up existing LoRA adapter...")
        stamp = datetime.fromtimestamp(adapter_file.stat().st_ctime).strftime("Backup-%Y-%m-%d")

        # Create the dated subfolder (a no-op when it is already there)
        backup_dir = Path(f"{input_folder}/{stamp}")
        backup_dir.mkdir(parents=True, exist_ok=True)

        # An adapter already inside the subfolder means this backup was made before
        if Path(f"{input_folder}/{stamp}/adapter_model.bin").is_file():
            print(" - Backup already exists. Skipping backup process.")
            return

        # Only regular files directly inside the folder are copied
        for entry in Path(input_folder).iterdir():
            if entry.is_file():
                shutil.copy2(entry, backup_dir)
    except Exception as e:
        print("An error occurred in backup_adapter:", str(e))
def calc_trainable_parameters(model):
    """Count the parameters of ``model``.

    Returns:
        ``(trainable, total)``: the number of elements in parameters with
        ``requires_grad`` set, and the number of elements in all parameters.
    """
    total = 0
    trainable = 0
    for _name, tensor in model.named_parameters():
        count = tensor.numel()
        # if using DS Zero 3 and the weights are initialized empty
        if count == 0 and hasattr(tensor, "ds_numel"):
            count = tensor.ds_numel
        total += count
        if tensor.requires_grad:
            trainable += count
    return trainable, total
def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch_size: int, batch_size: int, epochs: int, learning_rate: str, lr_scheduler_type: str, lora_rank: int, lora_alpha: int, lora_dropout: float, cutoff_len: int, dataset: str, eval_dataset: str, format: str, eval_steps: int, raw_text_file: str, higher_rank_limit: bool, warmup_steps: int, optimizer: str, hard_cut_string: str, train_only_after: str, stop_at_loss: float, add_eos_token: bool, min_chars: int, report_to: str, precize_slicing_overlap: bool, add_eos_token_type: str, save_steps_under_loss: float, add_bos_token: bool, training_projection: str,sliding_window:bool,warmup_ratio:float, grad_accumulation: int,neft_noise_alpha:float, group_by_length:bool,eliminate_long_blocks:bool,lora_target_linear:bool, stop_at_epoch: float, datasetJSONL:str, eval_datasetJSONL:str, eval_stepsJSONL:int, hybrid_training:bool, hybrid_data_ratio:int, hybrid_text_ratio:int):
if shared.args.monkey_patch:
from alpaca_lora_4bit.monkeypatch.peft_tuners_lora_monkey_patch import (
replace_peft_model_with_int4_lora_model
)
replace_peft_model_with_int4_lora_model()
global train_log_graph
global WANT_INTERRUPT
global mapped_prompts
mapped_prompts = 0
WANT_INTERRUPT = False
statistics['loss'] = []
statistics['loss'].append({'epoch': 0, 'value': 0})
zero_pd = pd.DataFrame(statistics['loss'])
# == Input validation / processing ==
yield "Preparing the input...", zero_pd
lora_file_path = clean_path(None, lora_name)
if lora_file_path.strip() == '':
yield "Missing or invalid LoRA file name input.", zero_pd
return
lora_file_path = f"{Path(shared.args.lora_dir)}/{lora_file_path}"
actual_lr = float(learning_rate)
model_type = type(shared.model).__name__
if model_type in MODEL_CLASSES:
model_id = MODEL_CLASSES[model_type]
else:
model_id = "llama"
if model_type == "PeftModelForCausalLM":
if len(shared.lora_names) > 0:
yield "You are trying to train a LoRA while you already have another LoRA loaded. This will work, but may have unexpected effects. *(Will continue anyway in 5 seconds, press `Interrupt` to stop.)*", zero_pd
logger.warning("Training LoRA over top of another LoRA. May have unexpected effects.")
else:
yield "Model ID not matched due to LoRA loading. Consider reloading base model. *(Will continue anyway in 5 seconds, press `Interrupt` to stop.)*", zero_pd
logger.warning("Model ID not matched due to LoRA loading. Consider reloading base model.")
else:
yield "LoRA training has only currently been validated for LLaMA, OPT, GPT-J, and GPT-NeoX models. Unexpected errors may follow. *(Will continue anyway in 5 seconds, press `Interrupt` to stop.)*", zero_pd
logger.warning(f"LoRA training has only currently been validated for LLaMA, OPT, GPT-J, and GPT-NeoX models. (Found model type: {model_type})")
time.sleep(5)
if shared.args.loader == 'GPTQ-for-LLaMa' and not shared.args.monkey_patch:
yield "LoRA training with GPTQ-for-LLaMa requires loading with `--monkey-patch`", zero_pd