-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpipeline.py
63 lines (56 loc) · 1.25 KB
/
pipeline.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import pyarrow as pa
from fondant.pipeline import Pipeline, Resources
from config import BASE_PATH, CLIP_MODEL, OUTPUT_DIR
# Top-level pipeline object; the read step further down is started from it.
# BASE_PATH comes from the local config module.
pipeline = Pipeline(base_path=BASE_PATH, name="index-datacomp-small-12m")
# Arguments handed to the "load_from_hf_hub" component.
hf_hub_arguments = {
    "dataset_name": "mlfoundations/datacomp_small",
    "index_column": "uid",
    "n_rows_to_load": 100000,  # Remove this to run on full dataset
}

# First step: read the source dataset, declaring a single string field "url"
# as what this step produces.
dataset = pipeline.read(
    "load_from_hf_hub",
    arguments=hf_hub_arguments,
    produces={"url": pa.string()},
)
# Resource limits requested for the download step.
download_resources = Resources(cpu_limit="32", memory_limit="256G")

# Apply the "download_images" component with 25 connections and
# 1000 rows per input partition.
# NOTE(review): `consumes` presumably maps the component's "image_url" input
# onto the dataset's "url" field — confirm against the component spec.
dataset = dataset.apply(
    "download_images",
    arguments={"n_connections": 25},
    consumes={"image_url": "url"},
    resources=download_resources,
    input_partition_rows=1000,
)
# Apply the "embed_images" component using the CLIP model named in config,
# with a batch size of 32, on 4 NVIDIA_TESLA_T4 accelerators.
#
# The memory limit carries an explicit "G" unit, in the same style as the
# "256G" limit of the download step. A unitless "30" is presumably read as a
# plain byte count by the runner, which is far too small for this step.
dataset = dataset.apply(
    "embed_images",
    arguments={
        "model_id": CLIP_MODEL,
        "batch_size": 32,
    },
    resources=Resources(
        cpu_limit="8",
        memory_limit="30G",
        accelerator_number=4,
        accelerator_name="NVIDIA_TESLA_T4",
    ),
)
# Output location, built from the configured base path and output directory.
output_path = f"{BASE_PATH}/{OUTPUT_DIR}"

# Final step: pass the "embedding" (list of float32) and "url" (string)
# fields to the "write_to_file" component, requesting parquet output.
dataset.write(
    "write_to_file",
    arguments={"path": output_path, "format": "parquet"},
    consumes={
        "embedding": pa.list_(pa.float32()),
        "url": pa.string(),
    },
)