Skip to content

Commit

Permalink
fix missing field meta tag on ray mode
Browse files Browse the repository at this point in the history
  • Loading branch information
Cathy0908 committed Jan 10, 2025
1 parent 8679da7 commit f030f47
Showing 1 changed file with 13 additions and 0 deletions.
13 changes: 13 additions & 0 deletions data_juicer/core/ray_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from data_juicer import cuda_device_count
from data_juicer.core.data import DJDataset
from data_juicer.ops import Deduplicator, Filter, Mapper
from data_juicer.ops.base_op import TAGGING_OPS
from data_juicer.utils.constant import Fields
from data_juicer.utils.lazy_loader import LazyLoader
from data_juicer.utils.process_utils import calculate_np
Expand Down Expand Up @@ -108,6 +109,18 @@ def _run_single_op(self, op):
op_proc = calculate_np(op._name, op.mem_required, op.cpu_required,
self.num_proc, op.use_cuda())
num_gpus = get_num_gpus(op, op_proc)

if op._name in TAGGING_OPS.modules and Fields.meta not in self.data.columns(
):

def process_batch_arrow(table: pyarrow.Table):
    """Return ``table`` with an extra ``Fields.meta`` column appended.

    The new column holds one freshly created empty dict per row of the
    incoming batch, so every sample has a meta entry to write into.

    :param table: a batch of samples as a ``pyarrow.Table``.
    :return: the table produced by ``append_column``, i.e. the input
        columns plus the ``Fields.meta`` column.
    """
    # A distinct dict per row (not one shared object repeated), wrapped
    # in an outer list so the values are passed as a single chunk.
    empty_metas = [{} for _ in range(len(table))]
    return table.append_column(Fields.meta, [empty_metas])

self.data = self.data.map_batches(process_batch_arrow,
batch_format='pyarrow')

try:
batch_size = getattr(op, 'batch_size',
1) if op.is_batched_op() else 1
Expand Down

0 comments on commit f030f47

Please sign in to comment.