-
Notifications
You must be signed in to change notification settings - Fork 195
/
Copy pathconfig_all.yaml
905 lines (893 loc) · 124 KB
/
config_all.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
# Process config example including:
# - all global arguments
# - all ops and their arguments
# global parameters
project_name: 'all' # project name for distinguishing your configs
dataset_path: '/path/to/your/dataset' # path to your dataset directory or file, optionally with weights (0.0-1.0); a weight is 1.0 by default.
# accepted format: 'weight1(optional) dataset1-path weight2(optional) dataset2-path'
export_path: '/path/to/result/dataset.jsonl' # path to processed result dataset. Supported suffixes include ['jsonl', 'json', 'parquet']
export_shard_size: 0 # shard size of the exported dataset in bytes. By default, it's 0, which means the whole dataset is exported into only one file. If it's set to a positive number, the exported dataset will be split into several dataset shards, and the max size of each shard won't be larger than the export_shard_size
export_in_parallel: false # whether to export the result dataset in parallel to a single file, which usually takes less time. It only works when export_shard_size is 0, and its default number of processes is the same as the argument np. **Notice**: If it's True, sometimes exporting in parallel might require much more time due to the IO blocking, especially for very large datasets. When this happens, False is a better choice, although it takes more time.
np: 4 # number of subprocesses to process your dataset
text_keys: 'text' # the key name of the field that holds the sample texts to be processed, e.g., `text`, `instruction`, `output`, ...
# Note: currently, we support specifying only ONE key for each op; for cases requiring multiple keys, users can specify the op multiple times. We will only use the first key of `text_keys` when you set multiple keys.
suffixes: [] # the suffixes of files that will be read. For example: '.txt', 'txt' or ['txt', '.pdf', 'docx']
use_cache: true # whether to use the cache management of Hugging Face datasets. It might take up lots of disk space when using cache
ds_cache_dir: null # cache dir for Hugging Face datasets. In default, it's the same as the environment variable `HF_DATASETS_CACHE`, whose default value is usually "~/.cache/huggingface/datasets". If this argument is set to a valid path by users, it will override the default cache dir
open_monitor: true # Whether to open the monitor to trace resource utilization for each OP during data processing. It's True in default.
use_checkpoint: false # whether to use the checkpoint management to save the latest version of dataset to work dir when processing. Rerunning the same config will reload the checkpoint and skip ops before it. Cache will be disabled when using checkpoint. If args of ops before the checkpoint are changed, all ops will be rerun from the beginning.
temp_dir: null # the path to the temp directory to store intermediate caches when cache is disabled, these cache files will be removed on-the-fly. In default, it's None, so the temp dir will be specified by system. NOTICE: you should be cautious when setting this argument because it might cause unexpected program behaviors when this path is set to an unsafe directory.
open_tracer: false # whether to open the tracer to trace the changes during process. It might take more time when opening tracer
op_list_to_trace: [] # only ops in this list will be traced by tracer. If it's empty, all ops will be traced. Only available when tracer is opened.
trace_num: 10 # number of samples to show the differences between datasets before and after each op. Only available when tracer is opened.
op_fusion: false # whether to fuse operators that share the same intermediate variables automatically. Op fusion might reduce the memory requirements slightly but speed up the whole process.
fusion_strategy: 'probe' # OP fusion strategy. Support ['greedy', 'probe'] now. 'greedy' means keep the basic OP order and put the fused OP to the last of each fused OP group. 'probe' means Data-Juicer will probe the running speed for each OP at the beginning and reorder the OPs and fused OPs according to their probed speed (fast to slow). It's 'probe' in default.
cache_compress: null # the compression method of the cache file, which can be specified in ['gzip', 'zstd', 'lz4']. If this parameter is None, the cache file will not be compressed. We recommend you turn on this argument when your input dataset is larger than tens of GB and your disk space is not enough.
keep_stats_in_res_ds: false # whether to keep the computed stats in the result dataset. The intermediate fields to store the stats computed by Filters will be removed if it's False. It's False in default.
keep_hashes_in_res_ds: false # whether to keep the computed hashes in the result dataset. The intermediate fields to store the hashes computed by Deduplicators will be removed if it's False. It's False in default.
adaptive_batch_size: false # whether to use adaptive batch sizes for each OP according to the probed results. It's False in default.
# for multimodal data processing
image_key: 'images' # key name of field to store the list of sample image paths.
image_special_token: '<__dj__image>' # the special token that represents an image in the text. In default, it's "<__dj__image>". You can specify your own special token according to your input dataset.
audio_key: 'audios' # key name of field to store the list of sample audio paths.
audio_special_token: '<__dj__audio>' # the special token that represents an audio in the text. In default, it's "<__dj__audio>". You can specify your own special token according to your input dataset.
video_key: 'videos' # key name of field to store the list of sample video paths.
video_special_token: '<__dj__video>' # the special token that represents a video in the text. In default, it's "<__dj__video>". You can specify your own special token according to your input dataset.
eoc_special_token: '<|__dj__eoc|>' # the special token that represents the end of a chunk in the text. In default, it's "<|__dj__eoc|>". You can specify your own special token according to your input dataset.
# for distributed processing
executor_type: default # type of executor, support "default" or "ray" for now.
ray_address: auto # the address of the Ray cluster.
# only for data analysis
percentiles: [0.25, 0.5, 0.75] # percentiles to analyze the dataset distribution
export_original_dataset: false # whether to export the original dataset with stats. If you only need the stats of the dataset, setting it to false could speed up the exporting.
save_stats_in_one_file: false # whether to store all stats results into one file
# for sandbox or hpo
data_probe_algo: 'uniform' # sampling algorithm for dataset. Should be one of ["uniform", "frequency_specified_field_selector", "topk_specified_field_selector"]. It's "uniform" in default. Only used for dataset sampling.
data_probe_ratio: 1.0 # the sampling ratio to the original dataset size. It's 1.0 in default. Only used for dataset sampling.
hpo_config: null # path to a configuration file when using auto-HPO tool.
# process schedule: a list of several process operators with their arguments
process:
# Mapper ops. Most of these ops need no arguments.
- audio_ffmpeg_wrapped_mapper: # simple wrapper for FFmpeg audio filters
- calibrate_qa_mapper:  # calibrate question-answer pairs based on reference text.
    api_model: 'gpt-4o'  # API model name.
    api_endpoint: null  # URL endpoint for the API.
    response_path: null  # Path to extract content from the API response. Defaults to 'choices.0.message.content'.
    system_prompt: null  # System prompt for the calibration task.
    input_template: null  # Template for building the model input.
    reference_template: null  # Template for formatting the reference text.
    qa_pair_template: null  # Template for formatting question-answer pairs.
    output_pattern: null  # Regular expression for parsing model output.
    try_num: 3  # The number of retry attempts when there is an API call error or output parsing error.
    model_params: {}  # Parameters for initializing the model.
    sampling_params: {}  # Extra parameters passed to the API call.
- calibrate_query_mapper: # calibrate query in question-answer pairs based on reference text.
- calibrate_response_mapper: # calibrate response in question-answer pairs based on reference text.
- chinese_convert_mapper:  # convert Chinese between Traditional Chinese, Simplified Chinese and Japanese Kanji.
    mode: 's2t'  # choose the mode to convert Chinese: ['s2t', 't2s', 's2tw', 'tw2s', 's2hk', 'hk2s', 's2twp', 'tw2sp', 't2tw', 'tw2t', 'hk2t', 't2hk', 't2jp', 'jp2t']
- clean_email_mapper: # remove emails from text.
- clean_html_mapper: # remove html formats from text.
- clean_ip_mapper: # remove ip addresses from text.
- clean_links_mapper: # remove web links from text.
- clean_copyright_mapper: # remove copyright comments.
- dialog_intent_detection_mapper:  # Mapper to generate user's intent labels in dialog.
    api_model: 'gpt-4o'  # API model name.
    intent_candidates: null  # The output intent candidates. Use open-domain intent labels if it is None.
    max_round: 10  # The max number of rounds in the dialog to build the prompt.
    labels_key: 'dialog_intent_labels'  # The key name in the meta field to store the output labels. It is 'dialog_intent_labels' in default.
    analysis_key: 'dialog_intent_labels_analysis'  # The key name in the meta field to store the corresponding analysis. It is 'dialog_intent_labels_analysis' in default.
    api_endpoint: null  # URL endpoint for the API.
    response_path: null  # Path to extract content from the API response. Defaults to 'choices.0.message.content'.
    system_prompt: null  # System prompt for the task.
    query_template: null  # Template for query part to build the input prompt.
    response_template: null  # Template for response part to build the input prompt.
    candidate_template: null  # Template for intent candidates to build the input prompt.
    analysis_template: null  # Template for analysis part to build the input prompt.
    labels_template: null  # Template for labels part to build the input prompt.
    analysis_pattern: null  # Pattern to parse the return intent analysis.
    labels_pattern: null  # Pattern to parse the return intent labels.
    try_num: 3  # The number of retry attempts when there is an API call error or output parsing error.
    model_params: {}  # Parameters for initializing the API model.
    sampling_params: {}  # Extra parameters passed to the API call. e.g {'temperature': 0.9, 'top_p': 0.95}
- dialog_sentiment_detection_mapper:  # Mapper to generate user's sentiment labels in dialog.
    api_model: 'gpt-4o'  # API model name.
    sentiment_candidates: null  # The output sentiment candidates. Use open-domain sentiment labels if it is None.
    max_round: 10  # The max number of rounds in the dialog to build the prompt.
    labels_key: 'dialog_sentiment_labels'  # The key name in the meta field to store the output labels. It is 'dialog_sentiment_labels' in default.
    analysis_key: 'dialog_sentiment_labels_analysis'  # The key name in the meta field to store the corresponding analysis. It is 'dialog_sentiment_labels_analysis' in default.
    api_endpoint: null  # URL endpoint for the API.
    response_path: null  # Path to extract content from the API response. Defaults to 'choices.0.message.content'.
    system_prompt: null  # System prompt for the task.
    query_template: null  # Template for query part to build the input prompt.
    response_template: null  # Template for response part to build the input prompt.
    candidate_template: null  # Template for sentiment candidates to build the input prompt.
    analysis_template: null  # Template for analysis part to build the input prompt.
    labels_template: null  # Template for labels part to build the input prompt.
    analysis_pattern: null  # Pattern to parse the return sentiment analysis.
    labels_pattern: null  # Pattern to parse the return sentiment labels.
    try_num: 3  # The number of retry attempts when there is an API call error or output parsing error.
    model_params: {}  # Parameters for initializing the API model.
    sampling_params: {}  # Extra parameters passed to the API call. e.g {'temperature': 0.9, 'top_p': 0.95}
- dialog_sentiment_intensity_mapper:  # Mapper to predict user's sentiment intensity (from -5 to 5 in default prompt) in dialog.
    api_model: 'gpt-4o'  # API model name.
    max_round: 10  # The max number of rounds in the dialog to build the prompt.
    intensities_key: null  # The key name in the meta field to store the output sentiment intensities. It is 'dialog_sentiment_intensity' in default.
    analysis_key: null  # The key name in the meta field to store the corresponding analysis. It is 'dialog_sentiment_intensity_analysis' in default.
    api_endpoint: null  # URL endpoint for the API.
    response_path: null  # Path to extract content from the API response. Defaults to 'choices.0.message.content'.
    system_prompt: null  # System prompt for the task.
    query_template: null  # Template for query part to build the input prompt.
    response_template: null  # Template for response part to build the input prompt.
    analysis_template: null  # Template for analysis part to build the input prompt.
    intensity_template: null  # Template for intensity part to build the input prompt.
    analysis_pattern: null  # Pattern to parse the return sentiment analysis.
    intensity_pattern: null  # Pattern to parse the return sentiment intensity.
    try_num: 3  # The number of retry attempts when there is an API call error or output parsing error.
    model_params: {}  # Parameters for initializing the API model.
    sampling_params: {}  # Extra parameters passed to the API call. e.g {'temperature': 0.9, 'top_p': 0.95}
- dialog_topic_detection_mapper:  # Mapper to generate user's topic labels in dialog.
    api_model: 'gpt-4o'  # API model name.
    topic_candidates: null  # The output topic candidates. Use open-domain topic labels if it is None.
    max_round: 10  # The max number of rounds in the dialog to build the prompt.
    labels_key: 'dialog_topic_labels'  # The key name in the meta field to store the output labels. It is 'dialog_topic_labels' in default.
    analysis_key: 'dialog_topic_labels_analysis'  # The key name in the meta field to store the corresponding analysis. It is 'dialog_topic_labels_analysis' in default.
    api_endpoint: null  # URL endpoint for the API.
    response_path: null  # Path to extract content from the API response. Defaults to 'choices.0.message.content'.
    system_prompt: null  # System prompt for the task.
    query_template: null  # Template for query part to build the input prompt.
    response_template: null  # Template for response part to build the input prompt.
    candidate_template: null  # Template for topic candidates to build the input prompt.
    analysis_template: null  # Template for analysis part to build the input prompt.
    labels_template: null  # Template for labels part to build the input prompt.
    analysis_pattern: null  # Pattern to parse the return topic analysis.
    labels_pattern: null  # Pattern to parse the return topic labels.
    try_num: 3  # The number of retry attempts when there is an API call error or output parsing error.
    model_params: {}  # Parameters for initializing the API model.
    sampling_params: {}  # Extra parameters passed to the API call. e.g {'temperature': 0.9, 'top_p': 0.95}
- expand_macro_mapper: # expand macro definitions in Latex text.
- extract_entity_attribute_mapper:  # Extract attributes for given entities from the text.
    api_model: 'gpt-4o'  # API model name.
    query_entities: ["孙悟空", "猪八戒"]  # Entity list to be queried.
    query_attributes: ["人物性格"]  # Attribute list to be queried.
    entity_key: 'entity'  # The key name in the meta field to store the given main entity for attribute extraction.
    entity_attribute_key: 'attribute'  # The key name in the meta field to store the given attribute to be extracted.
    attribute_desc_key: 'attribute_description'  # The key name in the meta field to store the extracted attribute description.
    support_text_key: 'support_text'  # The key name in the meta field to store the attribute support text extracted from the raw text.
    api_endpoint: null  # URL endpoint for the API.
    response_path: null  # Path to extract content from the API response. Defaults to 'choices.0.message.content'.
    system_prompt_template: null  # System prompt template for the task. Need to be specified by given entity and attribute.
    input_template: null  # Template for building the model input.
    attr_pattern_template: null  # Pattern for parsing the attribute from output. Need to be specified by given attribute.
    demo_pattern: null  # Pattern for parsing the demonstration from output to support the attribute.
    try_num: 3  # The number of retry attempts when there is an API call error or output parsing error.
    drop_text: false  # Whether to drop the text in the output.
    model_params: {}  # Parameters for initializing the API model.
    sampling_params: {}  # Extra parameters passed to the API call. e.g {'temperature': 0.9, 'top_p': 0.95}
- extract_entity_relation_mapper:  # Extract entities and relations in the text for knowledge graph.
    api_model: 'gpt-4o'  # API model name.
    entity_types: ['person', 'organization', 'location']  # Pre-defined entity types for knowledge graph.
    entity_key: 'entity'  # The key name in the meta field to store the entities.
    relation_key: 'relation'  # The key name in the meta field to store the relations between entities.
    api_endpoint: null  # URL endpoint for the API.
    response_path: null  # Path to extract content from the API response. Defaults to 'choices.0.message.content'.
    prompt_template: null  # The template of input prompt.
    tuple_delimiter: null  # Delimiter to separate items in outputs.
    record_delimiter: null  # Delimiter to separate records in outputs.
    completion_delimiter: null  # To mark the end of the output.
    max_gleaning: 1  # the extra max num to call LLM to glean entities and relations.
    continue_prompt: null  # the prompt for gleaning entities and relations.
    if_loop_prompt: null  # the prompt to determine whether to stop gleaning.
    entity_pattern: null  # Regular expression for parsing entity record.
    relation_pattern: null  # Regular expression for parsing relation record.
    try_num: 3  # The number of retry attempts when there is an API call error or output parsing error.
    drop_text: false  # Whether to drop the text in the output.
    model_params: {}  # Parameters for initializing the API model.
    sampling_params: {}  # Extra parameters passed to the API call. e.g {'temperature': 0.9, 'top_p': 0.95}
- extract_event_mapper:  # Extract events and relevant characters in the text
    api_model: 'gpt-4o'  # API model name.
    event_desc_key: 'event_description'  # The key name in the meta field to store the event descriptions.
    relevant_char_key: 'relevant_characters'  # The key name in the meta field to store the relevant characters to the events.
    api_endpoint: null  # URL endpoint for the API.
    response_path: null  # Path to extract content from the API response. Defaults to 'choices.0.message.content'.
    system_prompt: null  # System prompt for the task.
    input_template: null  # Template for building the model input.
    output_pattern: null  # Regular expression for parsing model output.
    try_num: 3  # The number of retry attempts when there is an API call error or output parsing error.
    drop_text: false  # Whether to drop the text in the output.
    model_params: {}  # Parameters for initializing the API model.
    sampling_params: {}  # Extra parameters passed to the API call. e.g {'temperature': 0.9, 'top_p': 0.95}
- extract_keyword_mapper:  # Generate keywords for the text.
    api_model: 'gpt-4o'  # API model name.
    keyword_key: 'keyword'  # The key name in the meta field to store the keywords.
    api_endpoint: null  # URL endpoint for the API.
    response_path: null  # Path to extract content from the API response. Defaults to 'choices.0.message.content'.
    prompt_template: null  # The template of input prompt.
    completion_delimiter: null  # To mark the end of the output.
    output_pattern: null  # Regular expression for parsing keywords.
    try_num: 3  # The number of retry attempts when there is an API call error or output parsing error.
    drop_text: false  # Whether to drop the text in the output.
    model_params: {}  # Parameters for initializing the API model.
    sampling_params: {}  # Extra parameters passed to the API call. e.g {'temperature': 0.9, 'top_p': 0.95}
- extract_nickname_mapper:  # Extract nickname relationship in the text.
    api_model: 'gpt-4o'  # API model name.
    nickname_key: 'nickname'  # The key name in the meta field to store the nickname relationship.
    api_endpoint: null  # URL endpoint for the API.
    response_path: null  # Path to extract content from the API response. Defaults to 'choices.0.message.content'.
    system_prompt: null  # System prompt for the task.
    input_template: null  # Template for building the model input.
    output_pattern: null  # Regular expression for parsing model output.
    try_num: 3  # The number of retry attempts when there is an API call error or output parsing error.
    drop_text: false  # Whether to drop the text in the output.
    model_params: {}  # Parameters for initializing the API model.
    sampling_params: {}  # Extra parameters passed to the API call. e.g {'temperature': 0.9, 'top_p': 0.95}
- extract_support_text_mapper:  # extract support sub text for a summary.
    api_model: 'gpt-4o'  # API model name.
    summary_key: 'event_description'  # The key name in the meta field to store the input summary. It's "event_description" in default.
    support_text_key: 'support_text'  # The key name in the meta field to store the output support text for the summary. It's "support_text" in default.
    api_endpoint: null  # URL endpoint for the API.
    response_path: null  # Path to extract content from the API response. Defaults to 'choices.0.message.content'.
    system_prompt: null  # System prompt for the task.
    input_template: null  # Template for building the model input.
    try_num: 3  # The number of retry attempts when there is an API call error or output parsing error.
    drop_text: false  # Whether to drop the text in the output.
    model_params: {}  # Parameters for initializing the API model.
    sampling_params: {}  # Extra parameters passed to the API call. e.g {'temperature': 0.9, 'top_p': 0.95}
- fix_unicode_mapper: # fix unicode errors in text.
- generate_qa_from_examples_mapper:  # mapper to generate question and answer pairs from examples.
    hf_model: 'Qwen/Qwen2.5-7B-Instruct'  # Model name on huggingface to generate question and answer pairs.
    seed_file: 'demos/data/demo-dataset-chatml.jsonl'  # Path to the seed file in chatml format.
    example_num: 3  # The number of randomly selected seed examples.
    similarity_threshold: 0.7  # the similarity score threshold between the generated samples and the seed examples. Range from 0 to 1. Samples with similarity score less than this threshold will be kept.
    system_prompt: null  # System prompt for guiding the generation task.
    input_template: null  # Template for building the input prompt.
    example_template: null  # Template for formatting each QA example.
    qa_pair_template: null  # Template for formatting a single QA pair within each example.
    output_pattern: null  # Regular expression pattern to extract questions and answers from model response.
    enable_vllm: false  # Whether to use vllm for inference acceleration.
    model_params: {}  # Parameters for initializing the model.
    sampling_params: {}  # Sampling parameters for text generation. e.g {'temperature': 0.9, 'top_p': 0.95}
- generate_qa_from_text_mapper:  # mapper to generate question and answer pairs from text.
    hf_model: 'alibaba-pai/pai-qwen1_5-7b-doc2qa'  # Model name on huggingface to generate question and answer pairs.
    output_pattern: null  # Regular expression pattern to extract questions and answers from model response.
    enable_vllm: false  # Whether to use vllm for inference acceleration.
    model_params: {}  # Parameters for initializing the model.
    sampling_params: {}  # Sampling parameters for text generation. e.g {'temperature': 0.9, 'top_p': 0.95}
- image_blur_mapper:  # mapper to blur images.
    p: 0.2  # probability of the image being blurred
    blur_type: 'gaussian'  # type of blur kernel, including ['mean', 'box', 'gaussian']
    radius: 2  # radius of blur kernel
- image_captioning_from_gpt4v_mapper:  # generate samples whose texts are generated based on gpt-4-vision and the image
    mode: 'description'  # mode of text generated from images, can be one of ['resoning', 'description', 'conversation', 'custom']
    api_key: ''  # the API key to authenticate the request
    max_token: 500  # the maximum number of tokens to generate. Default is 500.
    temperature: 1.0  # controls the randomness of the output (range from 0 to 1). Default is 0. NOTE(review): the value set here is 1.0 - confirm the op's actual default.
    system_prompt: ''  # a string prompt used to set the context of a conversation and provide global guidance or rules for the gpt4-vision so that it can generate responses in the expected way. If `mode` set to `custom`, the parameter will be used
    user_prompt: ''  # a string prompt to guide the generation of gpt4-vision for each sample. It's "" in default, which means no prompt provided
    user_prompt_key: null  # the key name of fields in samples to store prompts for each sample. It's used for set different prompts for different samples. If it's none, use prompt in parameter "user_prompt". It's None in default
    keep_original_sample: true  # whether to keep the original sample. If it's set to False, there will be only generated text in the final datasets and the original text will be removed. It's True in default
    any_or_all: 'any'  # keep this sample with 'any' or 'all' strategy of all images. 'any': keep this sample if any images meet the condition. 'all': keep this sample only if all images meet the condition
- image_captioning_mapper:  # generate captions for images to augment datasets
    hf_img2seq: 'Salesforce/blip2-opt-2.7b'  # model name on huggingface to generate caption
    caption_num: 1  # how many candidate captions to generate for each image
    keep_candidate_mode: 'random_any'  # retain strategy for the generated $caption_num$ candidates. should be in ["random_any", "similar_one_simhash", "all"].
    keep_original_sample: true  # whether to keep the original sample. If it's set to False, there will be only generated captions in the final datasets and the original captions will be removed. It's True in default.
    prompt: null  # a string prompt to guide the generation of blip2 model for all samples globally. It's None in default, which means no prompt provided.
    prompt_key: null  # the key name of fields in samples to store prompts for each sample. It's used for set different prompts for different samples. If it's none, use prompt in parameter "prompt". It's None in default.
    mem_required: '16GB'  # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrain the maximum number of processes that can be launched
- image_diffusion_mapper:  # generate images by diffusion model
    hf_diffusion: 'CompVis/stable-diffusion-v1-4'  # stable diffusion model name on huggingface to generate image
    torch_dtype: 'fp32'  # the floating point type used to load the diffusion model. Can be one of ['fp32', 'fp16', 'bf16']
    revision: 'main'  # The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier allowed by Git.
    strength: 0.8  # parameter of stable diffusion model, indicates extent to transform the reference image. will ignore the input image if it equals 1
    guidance_scale: 7.5  # parameter of stable diffusion model, a higher guidance scale value encourages the model to generate images closely linked to the text prompt at the expense of lower image quality
    aug_num: 1  # the number of images to generate
    keep_original_sample: true  # whether to keep the original sample. If it's set to False, there will be only generated images in the final datasets and the original images will be removed. It's True in default.
    caption_key: null  # the key name of fields in samples to store captions for each image; the caption guides the diffusion model to produce what the image is
    hf_img2seq: 'Salesforce/blip2-opt-2.7b'  # model name on huggingface to generate caption if caption_key is null
    mem_required: '8GB'  # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrain the maximum number of processes that can be launched
- image_face_blur_mapper:  # blur faces detected in images
    cv_classifier: ''  # OpenCV classifier path for face detection. By default, we will use 'haarcascade_frontalface_alt.xml'.
    blur_type: 'gaussian'  # type of blur kernel, including ['mean', 'box', 'gaussian']
    radius: 2  # radius of blur kernel
- image_tagging_mapper:  # Mapper to generate image tags.
    tag_field_name: 'image_tags'  # the field name to store the tags. It's "image_tags" in default.
    mem_required: '9GB'  # memory required by this op
- nlpaug_en_mapper: # simply augment texts in English based on the nlpaug library
sequential: false # whether combine all augmentation methods to a sequence. If it's True, a sample will be augmented by all opened augmentation methods sequentially. If it's False, each opened augmentation method would generate its augmented samples independently.
aug_num: 1 # number of augmented samples to be generated. If `sequential` is True, there will be total aug_num augmented samples generated. If it's False, there will be (aug_num * #opened_aug_method) augmented samples generated.
keep_original_sample: true # whether to keep the original sample. If it's set to False, there will be only generated texts in the final datasets and the original texts will be removed. It's True in default.
delete_random_word: false # whether to open the augmentation method of deleting random words from the original texts. e.g. "I love LLM" --> "I LLM"
swap_random_word: false # whether to open the augmentation method of swapping random contiguous words in the original texts. e.g. "I love LLM" --> "Love I LLM"
spelling_error_word: false # whether to open the augmentation method of simulating the spelling error for words in the original texts. e.g. "I love LLM" --> "Ai love LLM"
split_random_word: false # whether to open the augmentation method of splitting words randomly with whitespaces in the original texts. e.g. "I love LLM" --> "I love LL M"
keyboard_error_char: false # whether to open the augmentation method of simulating the keyboard error for characters in the original texts. e.g. "I love LLM" --> "I ;ov4 LLM"
ocr_error_char: false # whether to open the augmentation method of simulating the OCR error for characters in the original texts. e.g. "I love LLM" --> "I 10ve LLM"
delete_random_char: false # whether to open the augmentation method of deleting random characters from the original texts. e.g. "I love LLM" --> "I oe LLM"
swap_random_char: false # whether to open the augmentation method of swapping random contiguous characters in the original texts. e.g. "I love LLM" --> "I ovle LLM"
insert_random_char: false # whether to open the augmentation method of inserting random characters into the original texts. e.g. "I love LLM" --> "I ^lKove LLM"
- nlpcda_zh_mapper: # simply augment texts in Chinese based on the nlpcda library
sequential: false # whether combine all augmentation methods to a sequence. If it's True, a sample will be augmented by all opened augmentation methods sequentially. If it's False, each opened augmentation method would generate its augmented samples independently.
aug_num: 1 # number of augmented samples to be generated. If `sequential` is True, there will be total aug_num augmented samples generated. If it's False, there will be (aug_num * #opened_aug_method) augmented samples generated.
keep_original_sample: true # whether to keep the original sample. If it's set to False, there will be only generated texts in the final datasets and the original texts will be removed. It's True in default.
replace_similar_word: false # whether to open the augmentation method of replacing random words with their similar words in the original texts. e.g. "这里一共有5种不同的数据增强方法" --> "这边一共有5种不同的数据增强方法"
replace_homophone_char: false # whether to open the augmentation method of replacing random characters with their homophones in the original texts. e.g. "这里一共有5种不同的数据增强方法" --> "这里一共有5种不同的濖据增强方法"
delete_random_char: false # whether to open the augmentation method of deleting random characters from the original texts. e.g. "这里一共有5种不同的数据增强方法" --> "这里一共有5种不同的数据增强"
swap_random_char: false # whether to open the augmentation method of swapping random contiguous characters in the original texts. e.g. "这里一共有5种不同的数据增强方法" --> "这里一共有5种不同的数据强增方法"
replace_equivalent_num: false # whether to open the augmentation method of replacing random numbers with their equivalent representations in the original texts. **Notice**: Only for numbers for now. e.g. "这里一共有5种不同的数据增强方法" --> "这里一共有伍种不同的数据增强方法"
- optimize_qa_mapper: # optimize question-answer pairs.
hf_model: 'Qwen/Qwen2.5-7B-Instruct' # model name on huggingface.
system_prompt: null # System prompt for guiding the optimization task.
input_template: null # Template for building the input for the model.
qa_pair_template: null # Template for formatting the question and answer pair.
output_pattern: null # Regular expression pattern to extract question and answer from model response.
enable_vllm: false # whether to use vllm for inference acceleration.
model_params: {} # Parameters for initializing the model.
sampling_params: {} # Sampling parameters for text generation. e.g {'temperature': 0.9, 'top_p': 0.95}
- optimize_query_mapper: # optimize query in question-answer pairs.
- optimize_response_mapper: # optimize response in question-answer pairs.
- pair_preference_mapper: # construct paired preference samples.
api_model: 'gpt-4o' # API model name.
api_endpoint: null # URL endpoint for the API.
response_path: null # Path to extract content from the API response. Defaults to 'choices.0.message.content'.
system_prompt: null # System prompt for guiding the generation task.
input_template: null # Template for building the model input.
output_pattern: null # Regular expression for parsing model output.
rejected_key: 'rejected_response' # The field name in the sample to store the generated rejected response.
reason_key: 'reason' # The field name in the sample to store the reason for generating the response.
try_num: 3 # The number of retries for the API call in case of response parsing failure.
model_params: {} # Parameters for initializing the API model.
sampling_params: {} # Extra parameters passed to the API call.
- punctuation_normalization_mapper: # normalize unicode punctuations to English punctuations.
- python_file_mapper: # execute a Python function defined in a file.
file_path: '' # The path to the Python file containing the function to be executed.
function_name: 'process_single' # The name of the function defined in the file to be executed.
- python_lambda_mapper:  # execute a Python lambda function on data samples.
    lambda_str: ''  # A string representation of the lambda function to be executed on data samples. If empty, the identity function is used.
    batched: false  # A boolean indicating whether to process input data in batches. Written as canonical lowercase `false`, matching the other booleans in this file.
- query_intent_detection_mapper: # Mapper to predict user's Intent label in query.
hf_model: 'bespin-global/klue-roberta-small-3i4k-intent-classification' # Hugging Face model ID to predict intent label.
zh_to_en_hf_model: 'Helsinki-NLP/opus-mt-zh-en' # Translation model from Chinese to English. If not None, translate the query from Chinese to English.
model_params: {} # model param for hf_model.
zh_to_en_model_params: {} # model param for zh_to_en_hf_model.
label_key: 'query_intent_label' # The key name in the meta field to store the output label. It is 'query_intent_label' in default.
score_key: 'query_intent_label_score' # The key name in the meta field to store the corresponding label score. It is 'query_intent_label_score' in default.
- query_sentiment_detection_mapper: # Mapper to predict user's sentiment label ('negative', 'neutral' and 'positive') in query.
hf_model: 'mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis' # Hugging Face model ID to predict sentiment label.
zh_to_en_hf_model: 'Helsinki-NLP/opus-mt-zh-en' # Translation model from Chinese to English. If not None, translate the query from Chinese to English.
model_params: {} # model param for hf_model.
zh_to_en_model_params: {} # model param for zh_to_en_hf_model.
label_key: 'query_sentiment_label' # The key name in the meta field to store the output label. It is 'query_sentiment_label' in default.
score_key: 'query_sentiment_label_score' # The key name in the meta field to store the corresponding label score. It is 'query_sentiment_label_score' in default.
- query_topic_detection_mapper: # Mapper to predict user's topic label in query.
hf_model: 'dstefa/roberta-base_topic_classification_nyt_news' # Hugging Face model ID to predict topic label.
zh_to_en_hf_model: 'Helsinki-NLP/opus-mt-zh-en' # Translation model from Chinese to English. If not None, translate the query from Chinese to English.
model_params: {} # model param for hf_model.
zh_to_en_model_params: {} # model param for zh_to_en_hf_model.
label_key: 'query_topic_label' # The key name in the meta field to store the output label. It is 'query_topic_label' in default.
score_key: 'query_topic_label_score' # The key name in the meta field to store the corresponding label score. It is 'query_topic_label_score' in default.
- relation_identity_mapper: # identify the relation between two entities in the text.
api_model: 'gpt-4o' # API model name.
source_entity: '孙悟空' # The source entity of the relation to be identified.
target_entity: '猪八戒' # The target entity of the relation to be identified.
output_key: 'role_relation' # The output key in the meta field in the samples. It is 'role_relation' in default.
api_endpoint: null # URL endpoint for the API.
response_path: null # Path to extract content from the API response. Defaults to 'choices.0.message.content'.
system_prompt_template: null # System prompt template for the task. Need to specify by entity1 and entity2.
input_template: null # Template for building the model input.
output_pattern_template: null # Regular expression template for parsing model output.
try_num: 3 # The number of retry attempts when there is an API call error or output parsing error.
drop_text: false # Whether to drop the text in the output.
model_params: {} # Parameters for initializing the API model.
sampling_params: {} # Extra parameters passed to the API call. e.g {'temperature': 0.9, 'top_p': 0.95}
- remove_bibliography_mapper: # remove bibliography from Latex text.
- remove_comments_mapper: # remove comments from Latex text, code, etc.
doc_type: tex # comment type you want to remove. Only support 'tex' for now.
inline: true # whether to remove inline comments
multiline: true # whether to remove multiline comments
- remove_header_mapper: # remove header texts from Latex text.
drop_no_head: true # whether to drop sample texts without headers
- remove_long_words_mapper: # remove much too long words from text.
min_len: 1 # the min word length to keep words.
max_len: 128 # the max word length to keep words.
- remove_non_chinese_character_mapper: # remove non-Chinese character in text samples.
keep_alphabet: true # whether to keep alphabet
keep_number: true # whether to keep number
keep_punc: true # whether to keep punctuation
- remove_repeat_sentences_mapper: # remove repeat sentences in text samples.
lowercase: false # whether to convert sample text to lower case
ignore_special_character: true # whether to ignore special characters when judging repeated sentences. Special characters are all characters except Chinese characters, letters and numbers
min_repeat_sentence_length: 2 # sentences shorter than this length will not be deduplicated. If ignore_special_character is set to True, then special characters are not included in this length
- remove_specific_chars_mapper: # remove characters specified by users
chars_to_remove: '◆●■►▼▲▴∆▻▷❖♡□' # a string or a list including those characters that need to be removed
- remove_table_text_mapper: # remove possible table texts from text.
min_col: 2 # the min num of columns in tables to remove
max_col: 20 # the max num of columns in tables to remove
- remove_words_with_incorrect_substrings_mapper: # remove words with incorrect substrings from text.
lang: en # sample in which language
tokenization: false # whether to use model to tokenize documents
substrings: ['http', 'www', '.com', 'href', '//'] # incorrect substrings to remove
- sentence_split_mapper: # split text to multiple sentences and join them with '\n'
lang: 'en' # split text in what language
- text_chunk_mapper:  # Split input text into chunks.
    max_len: 2000  # Split text into multiple texts with this max length if not None.
    split_pattern: '\n\n'  # Make sure to split at this pattern if it is not None, and force a cut if the length exceeds max_len. Single-quoted, so the value is the literal characters \n\n (no YAML escape processing).
    overlap_len: 200  # Overlap length of the split texts if not split at the split pattern.
    tokenizer: 'gpt-4o'  # The tokenizer name of Hugging Face tokenizers. The text length will be calculated as the token num if it is offered. Otherwise, the text length equals the string length.
    trust_remote_code: true  # for loading huggingface model. Written as canonical lowercase `true`, matching the other booleans in this file.
- video_captioning_from_audio_mapper: # caption a video according to its audio streams based on Qwen-Audio model
keep_original_sample: true # whether to keep the original sample. If it's set to False, there will be only captioned sample in the final datasets and the original sample will be removed. It's True in default.
mem_required: '30GB' # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrain the maximum number of processes that can be launched
- video_captioning_from_frames_mapper: # generate samples whose captions are generated based on an image-to-text model and sampled video frames. Captions from different frames will be concatenated to a single string.
hf_img2seq: 'Salesforce/blip2-opt-2.7b' # image-to-text model name on huggingface to generate caption
caption_num: 1 # how many candidate captions to generate for each video
keep_candidate_mode: 'random_any' # retain strategy for the generated $caption_num$ candidates. should be in ["random_any", "similar_one_simhash", "all"].
keep_original_sample: true # whether to keep the original sample. If it's set to False, there will be only generated captions in the final datasets and the original captions will be removed. It's True in default.
prompt: null # a string prompt to guide the generation of image-to-text model for all samples globally. It's None in default, which means no prompt provided.
prompt_key: null # the key name of fields in samples to store prompts for each sample. It's used for set different prompts for different samples. If it's none, use prompt in parameter "prompt". It's None in default.
frame_sampling_method: 'all_keyframes' # sampling method of extracting frame images from the videos. Should be one of ["all_keyframes", "uniform"]. The former one extracts all key frames and the latter one extract specified number of frames uniformly from the video. Default: "all_keyframes".
frame_num: 3 # the number of frames to be extracted uniformly from the video. Only works when frame_sampling_method is "uniform". If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration.
horizontal_flip: false # flip frame image horizontally (left to right).
vertical_flip: false # flip frame image vertically (top to bottom).
mem_required: '20GB' # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrain the maximum number of processes that can be launched
- video_captioning_from_summarizer_mapper: # generate video captions by summarizing several kinds of generated texts (captions from video/audio/frames, tags from audio/frames, ...)
hf_summarizer: 'mrm8488/flan-t5-large-finetuned-openai-summarize_from_feedback' # the summarizer model used to summarize texts generated by other methods.
consider_video_caption_from_video: true # whether to consider the video caption generated from video directly in the summarization process. Default: True.
consider_video_caption_from_audio: true # whether to consider the video caption generated from audio streams in the video in the summarization process. Default: True.
consider_video_caption_from_frames: true # whether to consider the video caption generated from sampled frames from the video in the summarization process. Default: True.
consider_video_tags_from_audio: true # whether to consider the video tags generated from audio streams in the video in the summarization process. Default: True.
consider_video_tags_from_frames: true # whether to consider the video tags generated from sampled frames from the video in the summarization process. Default: True.
vid_cap_from_vid_args: null # the arg dict for video captioning from video directly with keys are the arg names and values are the arg values. Default: None.
vid_cap_from_frm_args: null # the arg dict for video captioning from sampled frames from the video with keys are the arg names and values are the arg values. Default: None.
vid_tag_from_aud_args: null # the arg dict for video tagging from audio streams in the video with keys are the arg names and values are the arg values. Default: None.
vid_tag_from_frm_args: null # the arg dict for video tagging from sampled frames from the video with keys are the arg names and values are the arg values. Default: None.
keep_tag_num: 5 # max number N of tags from sampled frames to keep. Too many tags might bring negative influence to summarized text, so we consider to only keep the N most frequent tags. Default: 5.
keep_original_sample: true # whether to keep the original sample. If it's set to False, there will be only summarized captions in the final datasets and the original captions will be removed. It's True in default.
mem_required: '40GB' # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrain the maximum number of processes that can be launched
- video_captioning_from_video_mapper: # generate captions by frame images extracted from video to augment datasets
hf_video_blip: 'kpyu/video-blip-opt-2.7b-ego4d' # video-blip model name on huggingface to generate caption
caption_num: 1 # how many candidate captions to generate for each video
keep_candidate_mode: 'random_any' # retain strategy for the generated $caption_num$ candidates. should be in ["random_any", "similar_one_simhash", "all"].
keep_original_sample: true # whether to keep the original sample. If it's set to False, there will be only generated captions in the final datasets and the original captions will be removed. It's True in default.
prompt: null # a string prompt to guide the generation of video-blip model for all samples globally. It's None in default, which means no prompt provided.
prompt_key: null # the key name of fields in samples to store prompts for each sample. It's used for set different prompts for different samples. If it's none, use prompt in parameter "prompt". It's None in default.
frame_sampling_method: 'all_keyframes' # sampling method of extracting frame images from the videos. Should be one of ["all_keyframes", "uniform"]. The former one extracts all key frames and the latter one extract specified number of frames uniformly from the video. Default: "all_keyframes".
frame_num: 3 # the number of frames to be extracted uniformly from the video. Only works when frame_sampling_method is "uniform". If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration.
horizontal_flip: false # flip frame image horizontally (left to right).
vertical_flip: false # flip frame image vertically (top to bottom).
mem_required: '20GB' # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrain the maximum number of processes that can be launched
- video_extract_frames_mapper:  # extract frames from video files according to specified methods
    frame_sampling_method: 'all_keyframes'  # sampling method of extracting frame images from the videos. Should be one of ["all_keyframes", "uniform"]. The former one extracts all key frames and the latter one extracts a specified number of frames uniformly from the video. Default: "all_keyframes".
    frame_num: 3  # the number of frames to be extracted uniformly from the video. Only works when frame_sampling_method is "uniform". If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration.
    duration: 0  # The duration of each segment in seconds. If 0, frames are extracted from the entire video. If duration > 0, the video is segmented into multiple segments based on duration, and frames are extracted from each segment.
    frame_dir: null  # Output directory to save extracted frames. If null (None), a default directory based on the video file path is used. Must be YAML `null`: a bare `None` is loaded as the string "None".
- video_face_blur_mapper: # blur faces detected in videos
cv_classifier: '' # OpenCV classifier path for face detection. By default, we will use 'haarcascade_frontalface_alt.xml'.
blur_type: 'gaussian' # type of blur kernel, including ['mean', 'box', 'gaussian']
radius: 2 # radius of blur kernel
- video_ffmpeg_wrapped_mapper: # simple wrapper for FFmpeg video filters
- video_remove_watermark_mapper: # Remove the watermarks in videos given regions
roi_strings: ['0,0,0.1,0.1'] # a given list of regions the watermarks locate. The format of each can be "x1, y1, x2, y2", "(x1, y1, x2, y2)", or "[x1, y1, x2, y2]".
roi_type: ratio # the roi string type. When the type is 'pixel', (x1, y1), (x2, y2) are the locations of pixels in the top left corner and the bottom right corner respectively. If the roi_type is 'ratio', the coordinates are normalized by widths and heights.
roi_key: null # the key name of fields in samples to store roi_strings for each sample. It's used for set different rois for different samples.
frame_num: 10 # the number of frames to be extracted uniformly from the video to detect the pixels of watermark.
min_frame_threshold: 7 # a coordinate is considered as the location of a watermark pixel when it is a watermark pixel in no fewer than min_frame_threshold frames.
detection_method: pixel_value # the method to detect the pixels of watermark. If it is 'pixel_value', we consider the distribution of pixel value in each frame. If it is 'pixel_diversity', we will consider the pixel diversity in different frames.
- video_resize_aspect_ratio_mapper: # resize the aspect ratios of videos (a fraction of width by height, r=w/h) to a specified range
min_ratio: 9/21 # the minimum aspect ratio to enforce. Videos with an aspect ratio below `min_ratio` will be resized to match this minimum ratio. The ratio should be provided as a string in the format "9:21" or "9/21".
max_ratio: 21/9 # the maximum aspect ratio to enforce. Videos with an aspect ratio above `max_ratio` will be resized to match this maximum ratio. The ratio should be provided as a string in the format "21:9" or "21/9".
strategy: increase # the resizing strategy to apply when adjusting the video dimensions. It can be either 'decrease' to reduce the dimension or 'increase' to enlarge it. Accepted values are ['decrease', 'increase'].
- video_resize_resolution_mapper: # map videos to ones with given resolution range
min_width: 640 # the min horizontal resolution (unit p), videos with width less than 'min_width' will be mapped to videos with equal or bigger width
max_width: 1280 # the max horizontal resolution (unit p), videos with width more than 'max_width' will be mapped to videos with equal or smaller width
min_height: 480 # the min vertical resolution (unit p), videos with height less than 'min_height' will be mapped to videos with equal or bigger height
max_height: 1080 # the max vertical resolution (unit p), videos with height more than 'max_height' will be mapped to videos with equal or smaller height
force_original_aspect_ratio: 'increase' # Enable decreasing or increasing output video width or height if necessary to keep the original aspect ratio
force_divisible_by: 4 # Ensures that both the output dimensions, width and height, are divisible by the given integer when used together with force_original_aspect_ratio
- video_split_by_duration_mapper: # Mapper to split video by duration.
split_duration: 10 # duration of each video split in seconds.
min_last_split_duration: 0.1 # the minimum allowable duration in seconds for the last video split. If the duration of the last split is less than this value, it will be discarded.
keep_original_sample: true # whether to keep the original sample. If it's set to False, there will be only cut sample in the final datasets and the original sample will be removed. It's True in default
- video_split_by_key_frame_mapper: # Mapper to split video by key frame.
keep_original_sample: true # whether to keep the original sample. If it's set to False, there will be only cut sample in the final datasets and the original sample will be removed. It's True in default
- video_split_by_scene_mapper: # split videos into scene clips
detector: 'ContentDetector' # PySceneDetect scene detector. Should be one of ['ContentDetector', 'ThresholdDetector', 'AdaptiveDetector']
threshold: 27.0 # threshold passed to the detector
min_scene_len: 15 # minimum length of any scene
show_progress: false # whether to show progress from scenedetect
- video_tagging_from_audio_mapper: # Mapper to generate video tags from audio streams extracted from the video.
hf_ast: 'MIT/ast-finetuned-audioset-10-10-0.4593' # Huggingface model name for the audio classification model.
tag_field_name: 'video_audio_tags' # the field name to store the tags. It's "video_audio_tags" in default.
mem_required: '500MB' # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrain the maximum number of processes that can be launched
- video_tagging_from_frames_mapper: # Mapper to generate video tags from frames extracted from the video.
frame_sampling_method: 'all_keyframes' # sampling method of extracting frame images from the videos. Should be one of ["all_keyframes", "uniform"]. The former one extracts all key frames and the latter one extract specified number of frames uniformly from the video. Default: "all_keyframes".
frame_num: 3 # the number of frames to be extracted uniformly from the video. Only works when frame_sampling_method is "uniform". If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration.
tag_field_name: 'video_frame_tags' # the key name in the meta field to store the tags. It's "video_frame_tags" in default.
mem_required: '9GB'
- whitespace_normalization_mapper: # normalize different kinds of whitespaces to English whitespace.
# Filter ops
- alphanumeric_filter: # filter text with alphabet/numeric ratio out of specific range.
tokenization: false # whether to count the ratio of alphanumeric to the total number of tokens.
min_ratio: 0.0 # the min ratio of filter range
max_ratio: 0.9 # the max ratio of filter range
- audio_duration_filter: # keep data samples whose audios' durations are within a specified range.
min_duration: 0 # the min audio duration of filter range (in seconds)
max_duration: 3600 # the max audio duration of filter range (in seconds)
any_or_all: any # keep this sample when any/all audios meet the filter condition
- audio_nmf_snr_filter: # keep data samples whose audios' SNRs (computed based on NMF) are within a specified range.
min_snr: 0 # the min audio SNR to keep samples in dB. It's 0 by default.
max_snr: 1000 # the max audio SNR to keep samples in dB. It's sys.maxsize by default.
nmf_iter_num: 500 # the max number of iterations to run NMF. It's 500 in default.
any_or_all: any # keep this sample when any/all audios meet the filter condition
- audio_size_filter:  # keep data samples whose audios' sizes are within a specified range.
    min_size: "0"  # the min audio size of filter range, given as a size string (same format as image_size_filter, e.g. "0" or "1TB")
    max_size: "1TB"  # the max audio size of filter range, given as a size string
    any_or_all: any  # keep this sample when any/all audios meet the filter condition
- average_line_length_filter: # filter text with the average length of lines out of specific range.
min_len: 10 # the min length of filter range
max_len: 10000 # the max length of filter range
- character_repetition_filter: # filter text with the character repetition ratio out of specific range
rep_len: 10 # repetition length for char-level n-gram
min_ratio: 0.0 # the min ratio of filter range
max_ratio: 0.5 # the max ratio of filter range
- flagged_words_filter: # filter text with the flagged-word ratio larger than a specific max value
lang: en # consider flagged words in what language
tokenization: false # whether to use model to tokenize documents
max_ratio: 0.0045 # the max ratio to filter text
flagged_words_dir: ./assets # directory to store flagged words dictionaries
use_words_aug: false # whether to augment words, especially for Chinese and Vietnamese
words_aug_group_sizes: [2] # the group size of words to augment
words_aug_join_char: "" # the join char between words to augment
- image_aesthetics_filter: # filter samples according to the aesthetics score of images.
hf_scorer_model: shunk031/aesthetics-predictor-v2-sac-logos-ava1-l14-linearMSE # Huggingface model name for the aesthetics predictor
min_score: 0.3 # the min aesthetics score of filter range
max_score: 1.0 # the max aesthetics score of filter range
any_or_all: any # keep this sample when any/all images meet the filter condition
mem_required: '1500MB' # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrain the maximum number of processes that can be launched
- image_aspect_ratio_filter: # filter samples according to the aspect ratios of images (a fraction of width by height, r=w/h) in them
min_ratio: 0.333 # the min aspect ratio of filter range
max_ratio: 3.0 # the max aspect ratio of filter range
any_or_all: any # keep this sample when any/all images meet the filter condition
- image_face_count_filter: # filter samples according to the face count in images
cv_classifier: '' # OpenCV classifier path for face detection. By default, we will use 'haarcascade_frontalface_alt.xml'.
min_face_count: 1 # the minimum number of faces required for samples.
max_face_count: 1 # the maximum number of faces required for samples.
- image_face_ratio_filter: # filter samples according to the face area ratios in images (r=face_area/image_area). If multiple faces are available, we use the largest one.
cv_classifier: '' # OpenCV classifier path for face detection. By default, we will use 'haarcascade_frontalface_alt.xml'.
min_ratio: 0.0 # the min face area ratio of filter range
max_ratio: 0.4 # the max face area ratio of filter range
- image_nsfw_filter: # filter samples according to the nsfw scores of images in them
hf_nsfw_model: Falconsai/nsfw_image_detection # Huggingface model name for nsfw classification
score_threshold: 0.5 # the nsfw score threshold for samples, range from 0 to 1. Samples with nsfw score less than this threshold will be kept.
any_or_all: any # keep this sample when any/all images meet the filter condition
mem_required: '1GB' # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrain the maximum number of processes that can be launched
- image_pair_similarity_filter: # filter samples according to the similarity score between the image pair.
hf_clip: 'openai/clip-vit-base-patch32' # model name of the CLIP model on huggingface
min_score: 0.1 # the min similarity score of filter range
max_score: 1.0 # the max similarity score of filter range
any_or_all: "any" # keep this sample when any/all images meet the filter condition
- image_shape_filter: # filter samples according to the widths and heights of images in them
min_width: 200 # the min width of width filter range
max_width: 5000 # the max width of width filter range
min_height: 200 # the min height of height filter range
max_height: 5000 # the max height of height filter range
any_or_all: any # keep this sample when any/all images meet the filter condition
- image_size_filter: # filter samples according to the size of images (in bytes) within them
min_size: "0" # the min size of filter range
max_size: "1TB" # the max size of filter range
any_or_all: any # keep this sample when any/all images meet the filter condition
- image_text_matching_filter: # filter samples according to the matching score between image and text.
hf_blip: Salesforce/blip-itm-base-coco # name of the Hugging Face BLIP model to use
min_score: 0.003 # the min matching score of filter range
max_score: 1.0 # the max matching score of filter range
horizontal_flip: false # flip image horizontally (left to right).
vertical_flip: false # flip image vertically (top to bottom).
reduce_mode: avg # reduce mode when one text corresponds to multiple images in a chunk, must be one of ['avg','max', 'min'].
any_or_all: any # keep this sample when any/all images meet the filter condition
mem_required: '1500MB' # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrain the maximum number of processes that can be launched
- image_text_similarity_filter: # filter samples according to the similarity between image and text.
hf_clip: openai/clip-vit-base-patch32 # name of the Hugging Face CLIP model to use
min_score: 0.1 # the min similarity of filter range
max_score: 1.0 # the max similarity of filter range
horizontal_flip: false # flip image horizontally (left to right).
vertical_flip: false # flip image vertically (top to bottom).
reduce_mode: avg # reduce mode when one text corresponds to multiple images in a chunk, must be one of ['avg','max', 'min'].
any_or_all: any # keep this sample when any/all images meet the filter condition
mem_required: '1500MB' # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrain the maximum number of processes that can be launched
- image_watermark_filter: # filter samples according to the predicted watermark probabilities of images in them
hf_watermark_model: amrul-hzz/watermark_detector # Hugging Face model name for watermark classification
prob_threshold: 0.8 # the predicted watermark probability threshold for samples, range from 0 to 1. Samples with watermark probability less than this threshold will be kept.
any_or_all: any # keep this sample when any/all images meet the filter condition
mem_required: '500MB' # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrain the maximum number of processes that can be launched
- language_id_score_filter: # keep text identified as the specified language with a language score above the given minimum (min_score)
lang: en # keep text in what language
min_score: 0.8 # the min language score required to keep text
- maximum_line_length_filter: # filter text with the maximum length of lines out of specific range
min_len: 10 # the min length of filter range
max_len: 10000 # the max length of filter range
- perplexity_filter: # filter text with perplexity score out of specific range
lang: en # compute perplexity in what language
max_ppl: 1500 # the max perplexity score to filter text
- phrase_grounding_recall_filter: # filter samples according to the locating recall of phrases extracted from text in the images.
hf_owlvit: google/owlvit-base-patch32 # name of the Hugging Face Owl-ViT model to use
min_recall: 0.1 # the min phrase grounding recall of filter range
max_recall: 1.0 # the max phrase grounding recall of filter range
horizontal_flip: false # flip image horizontally (left to right).
vertical_flip: false # flip image vertically (top to bottom).
iou_thr: 0.5 # the IoU threshold for NMS-like post-process
large_area_ratio_thr: 0.95 # the area ratio threshold for filtering out large predicted bboxes
conf_thr: 0.0 # the confidence score threshold for removing low-confidence bboxes
reduce_mode: avg # reduce mode when one text corresponds to multiple images in a chunk, must be one of ['avg','max', 'min'].
any_or_all: any # keep this sample when any/all images meet the filter condition
mem_required: '1GB' # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrain the maximum number of processes that can be launched
- special_characters_filter: # filter text with special-char ratio out of specific range
min_ratio: 0.0 # the min ratio of filter range
max_ratio: 0.25 # the max ratio of filter range
- specified_field_filter: # filter text with the specified field info out of specific range
field_key: '' # the target key corresponding to multi-level field information need to be separated by '.'
target_value: [] # the range of specified field information corresponding to the samples that need to be retained
- specified_numeric_field_filter: # filter text with the specified numeric field info out of specific range
field_key: '' # the target key corresponding to multi-level field information need to be separated by '.'
min_value: 0 # the min filter value in SpecifiedNumericField op
max_value: 10000 # the max filter value in SpecifiedNumericField op
- stopwords_filter: # filter text with stopword ratio smaller than a specific min value
lang: en # consider stopwords in what language
tokenization: false # whether to use model to tokenize documents
min_ratio: 0.3 # the min ratio to filter text
stopwords_dir: ./assets # directory to store stopwords dictionaries
use_words_aug: false # whether to augment words, especially for Chinese and Vietnamese
words_aug_group_sizes: [2] # the group size of words to augment
words_aug_join_char: "" # the join char between words to augment
- suffix_filter: # filter to keep samples with specified suffix.
suffixes: [] # the suffixes of the text that will be kept. For example: '.txt', 'txt' or ['txt', '.pdf', 'docx']
- text_action_filter: # filter text according to the number of action verbs
lang: en # consider the words in what language
min_action_num: 1 # text with fewer action verbs than this min action number will be filtered out
- text_entity_dependency_filter: # filter text without non-independent entity nouns
lang: en # consider the words in what language
min_dependency_num: 1 # the min number of adjacent edges of a non-independent noun in dependency tree
any_or_all: any # keep this sample when any/all entity nouns are non-independent
- text_length_filter: # filter text with length out of specific range
min_len: 10 # the min length of filter range
max_len: 10000 # the max length of filter range
- token_num_filter: # filter text with total token number out of specific range
hf_tokenizer: EleutherAI/pythia-6.9b-deduped # name of used Hugging Face tokenizer
min_num: 10 # the min number of filter range
max_num: 10000 # the max number of filter range
- video_aesthetics_filter: # filter samples according to the aesthetics score of frame images extracted from videos.
hf_scorer_model: shunk031/aesthetics-predictor-v2-sac-logos-ava1-l14-linearMSE # Hugging Face model name for the aesthetics predictor
min_score: 0.3 # the min aesthetics score of filter range
max_score: 1.0 # the max aesthetics score of filter range
frame_sampling_method: 'uniform' # sampling method of extracting frame images from the videos. Should be one of ["all_keyframes", "uniform"]. The former one extracts all key frames and the latter one extracts a specified number of frames uniformly from the video. Default: "uniform" with frame_num=3, considering that the number of keyframes can be large while their difference is usually small in terms of their aesthetics.
frame_num: 3 # the number of frames to be extracted uniformly from the video. Only works when frame_sampling_method is "uniform". If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration.
reduce_mode: avg # reduce mode applied to all frames extracted from videos, must be one of ['avg','max', 'min'].
any_or_all: any # keep this sample when any/all videos meet the filter condition
mem_required: '1500MB' # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrain the maximum number of processes that can be launched
- video_aspect_ratio_filter: # filter samples according to the aspect ratios of videos (a fraction of width by height, r=w/h) in them
min_ratio: '9/21' # the minimum aspect ratio to keep samples, supported format is a string, such as "9:21" or "9/21". Keep it quoted: YAML 1.1 loaders read an unquoted 9:21 as a base-60 integer, not a string.
max_ratio: '21/9' # the maximum aspect ratio to keep samples, supported format is a string, such as "21:9" or "21/9". Keep it quoted for the same reason as min_ratio.
any_or_all: any # keep this sample when any/all videos meet the filter condition
- video_duration_filter: # Keep data samples whose videos' durations are within a specified range.
min_duration: 0 # the min video duration of filter range (in seconds)
max_duration: 10 # the max video duration of filter range (in seconds)
any_or_all: any # keep this sample when any/all videos meet the filter condition
- video_frames_text_similarity_filter: # keep samples whose similarities between sampled video frame images and text are within a specific range.
hf_clip: openai/clip-vit-base-patch32 # clip model name on huggingface to compute the similarity between frame image and text. It's kind of language-related. For example, for Chinese datasets, ChineseCLIP might be a better choice.
min_score: 0.1 # the min similarity to keep samples.
max_score: 1.0 # the max similarity to keep samples.
frame_sampling_method: all_keyframes # sampling method of extracting frame images from the videos. Should be one of ["all_keyframes", "uniform"]. The former one extracts all key frames and the latter one extracts a specified number of frames uniformly from the video. Default: "all_keyframes".
frame_num: 3 # the number of frames to be extracted uniformly from the video. Only works when frame_sampling_method is "uniform". If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration.
horizontal_flip: false # flip frame image horizontally (left to right).
vertical_flip: false # flip frame image vertically (top to bottom).
reduce_mode: avg # reduce mode when one text corresponds to multiple videos in a chunk, must be one of ['avg','max', 'min'].
any_or_all: any # keep this sample when any/all videos meet the filter condition
mem_required: '1500MB' # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrain the maximum number of processes that can be launched
- video_motion_score_filter: # Keep samples with video motion scores within a specific range.
min_score: 0.25 # the minimum motion score to keep samples
max_score: 10000.0 # the maximum motion score to keep samples
sampling_fps: 2 # the sampling rate, in frames per second, used to compute optical flow
size: null # resize frames along the smaller edge before computing optical flow, or a sequence like (h, w)
max_size: null # maximum allowed for the longer edge of resized frames
divisible: 1 # The number that the dimensions must be divisible by.
relative: false # whether to normalize the optical flow magnitude to [0, 1], relative to the frame's diagonal length
any_or_all: any # keep this sample when any/all videos meet the filter condition
- video_motion_score_raft_filter: # Keep samples with video motion scores (based on RAFT model) within a specific range.
min_score: 1.0 # the minimum motion score to keep samples
max_score: 10000.0 # the maximum motion score to keep samples
sampling_fps: 2 # the sampling rate, in frames per second, used to compute optical flow
size: null # resize frames along the smaller edge before computing optical flow, or a sequence like (h, w)
max_size: null # maximum allowed for the longer edge of resized frames
divisible: 8 # The number that the dimensions must be divisible by.
relative: false # whether to normalize the optical flow magnitude to [0, 1], relative to the frame's diagonal length
any_or_all: any # keep this sample when any/all videos meet the filter condition
- video_nsfw_filter: # filter samples according to the nsfw scores of videos in them
hf_nsfw_model: Falconsai/nsfw_image_detection # Hugging Face model name for nsfw classification
score_threshold: 0.5 # the nsfw score threshold for samples, range from 0 to 1. Samples with nsfw score less than this threshold will be kept.
frame_sampling_method: all_keyframes # sampling method of extracting frame images from the videos. Should be one of ["all_keyframes", "uniform"]. The former one extracts all key frames and the latter one extracts a specified number of frames uniformly from the video. Default: "all_keyframes".
frame_num: 3 # the number of frames to be extracted uniformly from the video. Only works when frame_sampling_method is "uniform". If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration.
reduce_mode: avg # reduce mode for multiple sampled video frames to compute nsfw scores of videos, must be one of ['avg','max', 'min'].
any_or_all: any # keep this sample when any/all videos meet the filter condition
mem_required: '1GB' # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrain the maximum number of processes that can be launched
- video_ocr_area_ratio_filter: # Keep data samples whose detected text area ratios for specified frames in the video are within a specified range.
min_area_ratio: 0 # the min ocr area ratio to keep samples. It's 0 by default.
max_area_ratio: 1.0 # the max ocr area ratio to keep samples. It's 1.0 by default.
frame_sample_num: 3 # the number of sampled frames to calculate the ocr area ratio. If it's 1, only middle frame will be selected. If it's 2, only the first and the last frames will be selected. If it's larger than 2, in addition to the first and the last frames, other frames will be sampled evenly within the video duration.
languages_to_detect: ['ch_sim', 'en'] # texts in which languages should be detected. Default: ['ch_sim', 'en']. Full language list can be found here: https://www.jaided.ai/easyocr/.
any_or_all: any # keep this sample with 'any' or 'all' strategy of all videos. 'any': keep this sample if any videos meet the condition. 'all': keep this sample only if all videos meet the condition.
- video_resolution_filter: # filter samples according to the resolution of videos in them
min_width: 1280 # the min resolution of horizontal resolution filter range (unit p)
max_width: 4096 # the max resolution of horizontal resolution filter range (unit p)
min_height: 480 # the min resolution of vertical resolution filter range (unit p)
max_height: 1080 # the max resolution of vertical resolution filter range (unit p)
any_or_all: any # keep this sample when any/all videos meet the filter condition
- video_watermark_filter: # filter samples according to the predicted watermark probabilities of videos in them
hf_watermark_model: amrul-hzz/watermark_detector # Hugging Face model name for watermark classification
prob_threshold: 0.8 # the predicted watermark probability threshold for samples, range from 0 to 1. Samples with watermark probability less than this threshold will be kept.
frame_sampling_method: all_keyframes # sampling method of extracting frame images from the videos. Should be one of ["all_keyframes", "uniform"]. The former one extracts all key frames and the latter one extracts a specified number of frames uniformly from the video. Default: "all_keyframes".
frame_num: 3 # the number of frames to be extracted uniformly from the video. Only works when frame_sampling_method is "uniform". If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration.
reduce_mode: avg # reduce mode for multiple sampled video frames to compute final predicted watermark probabilities of videos, must be one of ['avg','max', 'min'].
any_or_all: any # keep this sample when any/all videos meet the filter condition
mem_required: '500MB' # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrain the maximum number of processes that can be launched
- video_tagging_from_frames_filter: # filter samples according to the tags of videos in them
tags: ['people'] # a tag list used to sift (filter) the videos; the full tag list can be found in https://github.com/xinyu1205/recognize-anything/blob/main/ram/data/ram_tag_list.txt
contain: any # require the videos to contain 'any' or 'all' of the given tags. When tags equal to [], 'all' keeps all samples, 'any' keeps no sample.
frame_sampling_method: all_keyframes # sampling method of extracting frame images from the videos. Should be one of ["all_keyframes", "uniform"]. The former one extracts all key frames and the latter one extracts a specified number of frames uniformly from the video. Default: "all_keyframes".
frame_num: 3 # the number of frames to be extracted uniformly from the video. Only works when frame_sampling_method is "uniform". If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration.
tag_field_name: 'video_frame_tags' # the key name in the meta field to store the tags. It's "video_frame_tags" by default.
any_or_all: any # keep this sample when any/all videos meet the filter condition
mem_required: '9GB' # memory this op is expected to need; as with the other mem_required settings, the system's available memory might constrain the maximum number of processes that can be launched
- words_num_filter: # filter text with number of words out of specific range
lang: en # sample in which language
tokenization: false # whether to use model to tokenize documents
min_num: 10 # the min number of filter range
max_num: 10000 # the max number of filter range
- word_repetition_filter: # filter text with the word repetition ratio out of specific range
lang: en # sample in which language
tokenization: false # whether to use model to tokenize documents
rep_len: 10 # repetition length for word-level n-gram
min_ratio: 0.0 # the min ratio of filter range
max_ratio: 0.5 # the max ratio of filter range
# Deduplicator ops
- document_deduplicator: # deduplicate text samples using md5 hashing exact matching method
lowercase: false # whether to convert text to lower case
ignore_non_character: false # whether to ignore non-alphabet characters, including whitespaces, digits, and punctuations
- document_minhash_deduplicator: # deduplicate text samples using MinHash-LSH method
tokenization: space # tokenization method for text. One of [space, punctuation, character, sentencepiece]
window_size: 5 # window size of shingling
num_permutations: 256 # number of permutations in minhash computing
jaccard_threshold: 0.7 # the min jaccard similarity threshold in near-duplicate detection. When the jaccard similarity of two sample texts is >= this threshold, they are regarded as similar samples and this op will only keep one of them after deduplication
num_bands: null # number of bands in LSH. It's None by default, in which case it will be determined by an optimal params computation algorithm that minimizes the weighted sum of probs of False Positives and False Negatives
num_rows_per_band: null # number of rows in each band in LSH. It's None by default, in which case it will be determined by an optimal params computation algorithm
lowercase: true # whether to convert text to lower case
ignore_pattern: null # whether to ignore sub-strings with specific pattern when computing minhash.
tokenizer_model: null # path for the sentencepiece model, used for sentencepiece tokenization.
- document_simhash_deduplicator: # deduplicate text samples using SimHash-LSH method
tokenization: space # tokenization method for text. One of [space, punctuation, character]
window_size: 6 # window size of shingling
num_blocks: 6 # number of blocks in SimHash computing
hamming_distance: 4 # the max hamming distance to regard 2 samples as similar enough pair. Should be less than num_blocks always
lowercase: true # whether to convert text to lower case
ignore_pattern: null # whether to ignore sub-strings with specific pattern when computing simhash.
- image_deduplicator: # deduplicator to deduplicate samples at document-level using exact matching of images between documents.
method: phash # hash method for image. One of [phash, dhash, whash, ahash]
consider_text: false # whether to consider text hash together with image hash when applying deduplication.
- video_deduplicator: # deduplicator to deduplicate samples at document-level using exact matching of videos between documents.
consider_text: false # whether to consider text hash together with video hash when applying deduplication.
- ray_video_deduplicator: # the simple video deduplicator that can run on multi-nodes using md5 hashing exact matching method
backend: 'ray_actor' # the backend for dedup, either 'ray_actor' or 'redis'
redis_address: 'redis://localhost:6379' # the address of redis server
- ray_image_deduplicator: # the simple image deduplicator that can deduplicate samples at document-level using exact matching of images between documents.
backend: 'ray_actor' # the backend for dedup, either 'ray_actor' or 'redis'
redis_address: 'redis://localhost:6379' # the address of redis server
method: phash # hash method for image. One of [phash, dhash, whash, ahash]
- ray_document_deduplicator: # the simple document deduplicator that can run on multi-nodes using md5 hashing exact matching method
backend: 'ray_actor' # the backend for dedup, either 'ray_actor' or 'redis'
redis_address: 'redis://localhost:6379' # the address of redis server
lowercase: false # whether to convert text to lower case
ignore_non_character: false # whether to ignore non-alphabet characters, including whitespaces, digits, and punctuations
- ray_bts_minhash_deduplicator: # the document deduplicator that can run on multi-nodes using minhashLSH algorithm
tokenization: space # tokenization method for text. One of [space, punctuation, character, sentencepiece]
window_size: 5 # window size of shingling
num_permutations: 256 # number of permutations in minhash computing
jaccard_threshold: 0.7 # the min jaccard similarity threshold in near-duplicate detection. When the jaccard similarity of two sample texts is >= this threshold, they are regarded as similar samples and this op will only keep one of them after deduplication
num_bands: null # number of bands in LSH. It's None by default, in which case it will be determined by an optimal params computation algorithm that minimizes the weighted sum of probs of False Positives and False Negatives
num_rows_per_band: null # number of rows in each band in LSH. It's None by default, in which case it will be determined by an optimal params computation algorithm
lowercase: true # whether to convert text to lower case
ignore_pattern: null # whether to ignore sub-strings with specific pattern when computing minhash.
tokenizer_model: null # path for the sentencepiece model, used for sentencepiece tokenization.
union_find_parallel_num: 'auto' # number of parallel workers for union-find algorithm. It's 'auto' by default, in which case it will be determined by half of the number of CPUs.
union_threshold: 256 # threshold for minhash values group to perform union-find algorithm.
max_pending_edge_buffer_task: 20 # max number of pending edge buffer ray tasks.
num_edge_buffer_task_returns: 10 # number of edge buffer tasks for `ray.wait` to return.
max_pending_filter_tasks: 20 # max number of pending filter ray tasks.
num_filter_task_returns: 10 # number of filter tasks for `ray.wait` to return.
merge_batch_size: 1000 # batch size for BTS operations.
tmp_file_name: './outputs/ray-dedup-tmp/' # the temporary folder name for deduplication.
# Selector ops
- frequency_specified_field_selector: # selector to select samples based on the sorted frequency of specified field value
field_key: '' # the target key; the levels of a multi-level field need to be separated by '.'
top_ratio: null # ratio of selected top specified field value (explicit null, same value as leaving it empty)
topk: null # number of selected top specified field value (explicit null, same value as leaving it empty)
reverse: true # determine the sorting rule; if true, sort in descending order
- random_selector: # selector to randomly select samples
select_ratio: null # the ratio to be sampled (explicit null, same value as leaving it empty)
select_num: null # the number to be sampled (explicit null, same value as leaving it empty)
- range_specified_field_selector: # selector to select a range of samples based on the sorted specified field value from smallest to largest.
field_key: '' # the target key; the levels of a multi-level field need to be separated by '.'
lower_percentile: null # the lower bound of the percentile to be sampled (explicit null, same value as leaving it empty)
upper_percentile: null # the upper bound of the percentile to be sampled (explicit null, same value as leaving it empty)
lower_rank: null # the lower rank of the percentile to be sampled (explicit null, same value as leaving it empty)
upper_rank: null # the upper rank of the percentile to be sampled (explicit null, same value as leaving it empty)
- tags_specified_field_selector: # Selector to select samples based on the tags of specified field.
field_key: '__dj__meta__.query_sentiment_label' # the target key; the levels of a multi-level field need to be separated by '.'
target_tags: ['happy', 'sad'] # Target tags to be selected.
- topk_specified_field_selector: # selector to select top samples based on the sorted specified field
field_key: '' # the target key; the levels of a multi-level field need to be separated by '.'
top_ratio: null # ratio of selected top samples (explicit null, same value as leaving it empty)
topk: null # number of selected top samples (explicit null, same value as leaving it empty)
reverse: true # determine the sorting rule; if true, sort in descending order
# Grouper ops.
- naive_grouper: # Group all samples to one batched sample.
- naive_reverse_grouper: # Split one batched sample to samples.
batch_meta_export_path: null # the path to export the batch meta. Just drop the batch meta if it is None.
- key_value_grouper: # Group samples into batched samples according to the values of the given keys.
group_by_keys: null # Group samples according to the values of these keys. Nested keys such as "__dj__stats__.text_len" are supported. It is [self.text_key] by default.
# Aggregator ops
- entity_attribute_aggregator: # Return conclusion of the given entity's attribute from some docs.
api_model: 'gpt-4o' # API model name.
entity: '孙悟空' # The given entity.
attribute: '人物经历' # The given attribute.
input_key: 'event_description' # The input key in the meta field of the samples. It is "event_description" in default.
output_key: 'entity_attribute' # The output key in the aggregation field of the samples. It is "entity_attribute" in default.
word_limit: 100 # Prompt the output length.
max_token_num: null # The max token num of the total tokens of the sub documents. Without limitation if it is None.
api_endpoint: null # URL endpoint for the API.
response_path: null # Path to extract content from the API response. Defaults to 'choices.0.message.content'.
system_prompt_template: null # System prompt template for the task. Need to be specified by given entity and attribute.
example_prompt: null # The example part in the system prompt.
input_template: null # The input template.
output_pattern_template: null # The output template.
try_num: 3 # The number of retry attempts when there is an API call error or output parsing error.
model_params: {} # Parameters for initializing the API model.
sampling_params: {} # Extra parameters passed to the API call. e.g {'temperature': 0.9, 'top_p': 0.95}
- meta_tags_aggregator: # Merge similar meta tags to one tag.
api_model: 'gpt-4o' # API model name.
meta_tag_key: 'query_sentiment_label' # The key of the meta tag to be mapped. It is "query_sentiment_label" by default.
target_tags: ['开心', '难过', '其他'] # The target tags that the meta tags are supposed to be mapped to.
api_endpoint: null # URL endpoint for the API.
response_path: null # Path to extract content from the API response. Defaults to 'choices.0.message.content'.
system_prompt: null # The system prompt.
input_template: null # The input template.
target_tag_template: null # The tag template for target tags.
tag_template: null # The tag template for each tag and its frequency.
output_pattern: null # The output pattern.
try_num: 3 # The number of retry attempts when there is an API call error or output parsing error.
model_params: {} # Parameters for initializing the API model.
sampling_params: {} # Extra parameters passed to the API call. e.g {'temperature': 0.9, 'top_p': 0.95}
- most_relavant_entities_aggregator: # Extract entities closely related to a given entity from some texts, and sort them in descending order of importance.
api_model: 'gpt-4o' # API model name.
entity: '孙悟空' # The given entity.
query_entity_type: '人物' # The type of the queried relevant entities.
input_key: 'event_description' # The input key in the meta field of the samples. It is "event_description" by default.
output_key: 'most_relavant_entities' # The output key in the aggregation field of the samples. It is "most_relavant_entities" by default (the key name keeps this spelling).
max_token_num: null # The max token num of the total tokens of the sub documents. Without limitation if it is None.
api_endpoint: null # URL endpoint for the API.
response_path: null # Path to extract content from the API response. Defaults to 'choices.0.message.content'.
system_prompt_template: null # System prompt template for the task. Need to be specified by given entity and entity_type.
input_template: null # The input template.
output_pattern: null # The output pattern.
try_num: 3 # The number of retry attempts when there is an API call error or output parsing error.
model_params: {} # Parameters for initializing the API model.
sampling_params: {} # Extra parameters passed to the API call. e.g {'temperature': 0.9, 'top_p': 0.95}
- nested_aggregator: # Considering the limitation of input length, nested aggregate contents for each given number of samples.
api_model: 'gpt-4o' # API model name.
input_key: 'event_description' # The input key in the meta field of the samples. It is "event_description" in default.
output_key: null # The output key in the aggregation field in the samples. It is same as the input_key in default.
max_token_num: null # The max token num of the total tokens of the sub documents. Without limitation if it is None.
api_endpoint: null # URL endpoint for the API.
response_path: null # Path to extract content from the API response. Defaults to 'choices.0.message.content'.
system_prompt: null # The system prompt.
sub_doc_template: null # The template for input text in each sample.
input_template: null # The input template.
try_num: 3 # The number of retry attempts when there is an API call error or output parsing error.
model_params: {} # Parameters for initializing the API model.
sampling_params: {} # Extra parameters passed to the API call. e.g {'temperature': 0.9, 'top_p': 0.95}