Read benchmarks on v6e (read cache + parallel downloads) #2925

Draft · wants to merge 37 commits into base: master
Changes from all commits (37 commits)
88d9c8f
File cache on memory
kislaykishore Nov 22, 2024
d4c8e6d
FIO TPU file cache on disk
kislaykishore Nov 22, 2024
1ccebe5
Remove memory limits
kislaykishore Dec 7, 2024
740fe1f
nec changes
anushka567 Dec 10, 2024
ea41001
fio workload also
anushka567 Dec 11, 2024
010cc49
adding node affinity
anushka567 Dec 12, 2024
11584fb
increasing num nodes in each pool to ensure parallel pod scheduling
anushka567 Dec 12, 2024
35926f5
making namespace configurable
anushka567 Dec 12, 2024
b4ecc90
tweaks for running paralley namespace/instanceid/nodepool
anushka567 Dec 12, 2024
6bc4d2c
fixes in log parsing
anushka567 Dec 12, 2024
b70ad35
buffer on mmemory
anushka567 Dec 12, 2024
eeac1ca
making script configurable
anushka567 Dec 13, 2024
ee9bd8b
making everything configurable
anushka567 Dec 13, 2024
9e4f70d
correcting namespace
anushka567 Dec 13, 2024
83792c3
correcting pod yaml
anushka567 Dec 13, 2024
701c004
fix
anushka567 Dec 13, 2024
993d1c2
workloads for parallel downloads
anushka567 Jan 21, 2025
1dea023
overall commands script
anushka567 Jan 21, 2025
b44b320
fio tester
anushka567 Jan 21, 2025
7d19a4e
values.yaml
anushka567 Jan 21, 2025
dd7c94b
run tests
anushka567 Jan 21, 2025
0298842
run gke tests
anushka567 Jan 21, 2025
9d2a55a
cmd update
anushka567 Jan 21, 2025
d7946b8
fix
anushka567 Jan 21, 2025
1fafe55
fix2
anushka567 Jan 21, 2025
0208b03
changes to pod yaml
anushka567 Jan 22, 2025
1993821
filecacheconfig option
anushka567 Jan 22, 2025
85f0a6c
metadatacachett to -1
anushka567 Jan 23, 2025
b65fd08
split workload
anushka567 Jan 23, 2025
5c8c4bb
boot disk size change
anushka567 Jan 23, 2025
10e99c6
ldap replace
anushka567 Jan 23, 2025
31d09a6
nmspc
anushka567 Jan 23, 2025
79fa63d
configure correct namespace
anushka567 Jan 23, 2025
af7cee9
setup before the run
anushka567 Jan 24, 2025
41384b6
inc pod active dedline
anushka567 Jan 24, 2025
ff0bf19
updated workload
anushka567 Jan 25, 2025
d76b5ea
updated workload 2
anushka567 Jan 25, 2025
45 changes: 45 additions & 0 deletions cmd.sh
@@ -0,0 +1,45 @@
# num_nodes must be the number of nodes in the nodepool that will be used.
cluster_name=$1
num_nodes=$2
nodepool=$3
vpc_network=$4

bootDiskSize=2tb
cache_location=memory
filecacheConfig=on
node_pool=$nodepool
instance_id=filecache${filecacheConfig}-buffer-${cache_location}-boot-${bootDiskSize}
namespace="${cache_location}-${bootDiskSize}"

env project_id=tpu-prod-env-large-adhoc \
project_number=716203006749 \
zone=us-central2-b \
cluster_name=${cluster_name} \
machine_type=ct6e-standard-4t \
num_nodes=${num_nodes} \
use_custom_csi_driver=true \
src_dir=$HOME/gcsfuse-custom-csi/.. \
gcsfuse_branch=master \
gcsfuse_src_dir=. \
workload_config=./perfmetrics/scripts/testing_on_gke/examples/workload1.json \
output_dir=. perfmetrics/scripts/testing_on_gke/examples/run-gke-tests.sh --debug $namespace $node_pool $instance_id $cache_location $vpc_network


env project_id=tpu-prod-env-large-adhoc \
project_number=716203006749 \
zone=us-central2-b \
cluster_name=${cluster_name} \
machine_type=ct6e-standard-4t \
num_nodes=${num_nodes} \
use_custom_csi_driver=true \
src_dir=$HOME/gcsfuse-custom-csi/.. \
gcsfuse_branch=master \
gcsfuse_src_dir=. \
workload_config=./perfmetrics/scripts/testing_on_gke/examples/workload2.json \
output_dir=. perfmetrics/scripts/testing_on_gke/examples/run-gke-tests.sh --debug $namespace $node_pool $instance_id $cache_location $vpc_network

# Output CSVs are stored under ./fio/output_{instance_id}.csv.




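For context, a minimal sketch of how this driver script could be invoked; the cluster, node count, nodepool, and VPC names below are illustrative:

```bash
# Hypothetical run: a 2-node v6e nodepool "tpu-pool" on cluster "bench-cluster"
# attached to VPC network "bench-vpc".
bash cmd.sh bench-cluster 2 tpu-pool bench-vpc

# With the defaults hard-coded above (filecacheConfig=on, cache_location=memory,
# bootDiskSize=2tb), the output lands in:
#   ./fio/output_filecacheon-buffer-memory-boot-2tb.csv
```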
@@ -102,7 +102,7 @@ def createOutputScenariosFromDownloadedFiles(args: dict) -> dict:
"""

output = {}
for root, _, files in os.walk(_LOCAL_LOGS_LOCATION + "/" + args.instance_id):
for root, _, files in os.walk(os.path.expanduser(_LOCAL_LOGS_LOCATION + "/" + args.instance_id)):
print(f"Parsing directory {root} ...")
if files:
# If directory contains gcsfuse_mount_options file, then parse gcsfuse
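The `os.path.expanduser` wrapper matters because `os.walk` does not expand a leading `~` itself; walking a literal `~/...` path silently yields nothing. A quick check, using a hypothetical logs path:

```bash
python3 - <<'EOF'
import os

path = "~/logs/run-1"            # stand-in for _LOCAL_LOGS_LOCATION + instance_id
print(os.path.expanduser(path))  # e.g. /home/user/logs/run-1
print(os.path.exists(path))      # False: the literal "~" directory does not exist
EOF
```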
@@ -254,8 +254,8 @@ def writeRecordsToCsvOutputFile(output: dict, output_file_path: str):
if scenario not in record_set["records"]:
print(f"{scenario} not in output so skipping")
continue

for i in range(len(record_set["records"]["local-ssd"])):
print(record_set)
for i in range(len(record_set["records"][scenario])):
r = record_set["records"][scenario][i]

try:
@@ -298,7 +298,7 @@ def writeRecordsToCsvOutputFile(output: dict, output_file_path: str):
if __name__ == "__main__":
args = parse_arguments()
ensure_directory_exists(_LOCAL_LOGS_LOCATION)

print("here parsing start\n")
dlioWorkloads = dlio_workload.ParseTestConfigForDlioWorkloads(
args.workload_config
)
@@ -309,7 +309,6 @@ def writeRecordsToCsvOutputFile(output: dict, output_file_path: str):
print("Mash is not installed, will skip parsing CPU and memory usage.")

output = createOutputScenariosFromDownloadedFiles(args)

output_file_path = args.output_file
# Create the parent directory of output_file_path if doesn't
# exist already.
@@ -41,6 +41,8 @@ def createHelmInstallCommands(
dlioWorkloads: set,
instanceId: str,
machineType: str,
nodePool: str,
bufferLocation: str,
) -> list:
"""Creates helm install commands for the given dlioWorkload objects."""
helm_commands = []
@@ -67,6 +69,8 @@
f'--set dlio.recordLength={dlioWorkload.recordLength}',
f'--set dlio.batchSize={batchSize}',
f'--set instanceId={instanceId}',
f'--set nodePool={nodePool}',
f'--set bufferLocation={bufferLocation}',
(
'--set'
f' gcsfuse.mountOptions={escape_commas_in_string(dlioWorkload.gcsfuseMountOptions)}'
@@ -93,6 +97,8 @@ def main(args) -> None:
dlioWorkloads,
args.instance_id,
args.machine_type,
args.nodePool,
args.bufferLocation,
)
buckets = [dlioWorkload.bucket for dlioWorkload in dlioWorkloads]
role = 'roles/storage.objectUser'
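Concretely, each generated command now also pins the pod to a nodepool and selects the write-buffer medium. A sketch of one emitted command; the release name, chart path, and values are illustrative, not from a real run:

```bash
helm install dlio-workload-0 ./dlio-workload \
  --set dlio.batchSize=64 \
  --set instanceId=filecacheon-buffer-memory-boot-2tb \
  --set nodePool=tpu-pool \
  --set bufferLocation=memory \
  --set gcsfuse.mountOptions="implicit-dirs"
```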
@@ -17,21 +17,24 @@ apiVersion: v1
kind: Pod
metadata:
name: {{ .Values.podName }}
{{- if ne .Values.scenario "local-ssd" }}
annotations:
gke-gcsfuse/volumes: "true"
gke-gcsfuse/memory-limit: "6Gi"
{{- end }}
gke-gcsfuse/memory-limit: "0"
spec:
restartPolicy: Never
nodeSelector:
cloud.google.com/gke-ephemeral-storage-local-ssd: "true"
node.kubernetes.io/instance-type: {{ .Values.nodeType }}
cloud.google.com/gke-tpu-topology: 2x2
cloud.google.com/gke-tpu-accelerator: tpu-v6e-slice
cloud.google.com/gke-nodepool: {{ .Values.nodePool }}
containers:
- name: dlio-tester
image: {{ .Values.image }}
ports:
- containerPort: 18200
hostPort: 11021
resources:
limits:
google.com/tpu: 4
cpu: {{ .Values.resourceLimits.cpu }}
memory: {{ .Values.resourceLimits.memory }}
requests:
@@ -105,37 +108,29 @@ spec:
- name: dshm
emptyDir:
medium: Memory
- name: gke-gcsfuse-cache
emptyDir:
medium: Memory
{{ if eq .Values.bufferLocation "bootDisk" }}
- name: gke-gcsfuse-buffer
emptyDir: {}
{{ else if eq .Values.bufferLocation "memory" }}
- name: gke-gcsfuse-buffer
emptyDir:
medium: Memory
{{ end }}
- name: gke-gcsfuse-tmp
emptyDir:
medium: Memory
- name: logging-vol
emptyDir: {}
- name: data-vol
{{- if eq .Values.scenario "local-ssd" }}
emptyDir: {}
{{- else if eq .Values.scenario "gcsfuse-generic" }}
csi:
driver: gcsfuse.csi.storage.gke.io
volumeAttributes:
bucketName: {{ .Values.bucketName }}
mountOptions: "{{ .Values.gcsfuse.mountOptions }}"
{{- else if eq .Values.scenario "gcsfuse-file-cache" }}
csi:
driver: gcsfuse.csi.storage.gke.io
volumeAttributes:
bucketName: {{ .Values.bucketName }}
metadataCacheTTLSeconds: "{{ .Values.gcsfuse.metadataCacheTTLSeconds }}"
metadataTypeCacheCapacity: "{{ .Values.gcsfuse.metadataTypeCacheCapacity }}"
metadataStatCacheCapacity: "{{ .Values.gcsfuse.metadataStatCacheCapacity }}"
fileCacheCapacity: "{{ .Values.gcsfuse.fileCacheCapacity }}"
fileCacheForRangeRead: "{{ .Values.gcsfuse.fileCacheForRangeRead }}"
gcsfuseLoggingSeverity: "debug"
mountOptions: implicit-dirs
{{- else }}
csi:
driver: gcsfuse.csi.storage.gke.io
volumeAttributes:
bucketName: {{ .Values.bucketName }}
metadataCacheTTLSeconds: "{{ .Values.gcsfuse.metadataCacheTTLSeconds }}"
metadataTypeCacheCapacity: "{{ .Values.gcsfuse.metadataTypeCacheCapacity }}"
metadataStatCacheCapacity: "{{ .Values.gcsfuse.metadataStatCacheCapacity }}"
gcsfuseLoggingSeverity: "debug"
gcsfuseLoggingSeverity: "info"
mountOptions: implicit-dirs
{{- end }}
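The effect of the `bufferLocation` switch can be checked by rendering the chart locally; the chart path is illustrative, and other required values (podName, bucketName, and so on) are omitted for brevity:

```bash
helm template ./dlio-workload --set bufferLocation=memory \
  | grep -A 2 'name: gke-gcsfuse-buffer'
# - name: gke-gcsfuse-buffer
#   emptyDir:
#     medium: Memory

helm template ./dlio-workload --set bufferLocation=bootDisk \
  | grep -A 1 'name: gke-gcsfuse-buffer'
# - name: gke-gcsfuse-buffer
#   emptyDir: {}
```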
@@ -20,11 +20,13 @@
image: jiaxun/dlio:v1.2.0
bucketName: gke-dlio-test-data
# scenario controls the kind of storage used for the load test: local-ssd runs directly on LSSD; gcsfuse-generic runs on a gcsfuse mount with gcsfuse.mountOptions passed by the caller; gcsfuse-no-file-cache and gcsfuse-file-cache behave as their names suggest.
scenario: local-ssd
nodeType: n2-standard-96
scenario: gcsfuse-generic
nodeType: ct6e-standard-4t
instanceId: ldap-yyyymmdd-hhmmss
podName:
outputDirPrefix:
nodePool:
bufferLocation:

resourceLimits:
cpu: 0
@@ -43,6 +45,4 @@ gcsfuse:
metadataCacheTTLSeconds: "360000"
metadataStatCacheCapacity: "-1"
metadataTypeCacheCapacity: "-1"
fileCacheCapacity: "-1"
fileCacheForRangeRead: "true"
mountOptions: "implicit-dirs"
@@ -17,27 +17,28 @@ apiVersion: v1
kind: Pod
metadata:
name: {{ .Values.podName }}
{{- if ne .Values.scenario "local-ssd" }}
namespace: {{ .Values.namespace }}
annotations:
gke-gcsfuse/volumes: "true"
{{- end }}
spec:
activeDeadlineSeconds: 50000
restartPolicy: Never
nodeSelector:
cloud.google.com/gke-ephemeral-storage-local-ssd: "true"
node.kubernetes.io/instance-type: {{ .Values.nodeType }}
cloud.google.com/gke-tpu-topology: 2x2
cloud.google.com/gke-tpu-accelerator: tpu-v6e-slice
cloud.google.com/gke-nodepool: {{ .Values.nodePool}}
hostNetwork: true
containers:
- name: fio-tester
image: {{ .Values.image }}
ports:
- containerPort: 11021
hostPort: 11021
securityContext: # for cache dropping in the benchmarking tests.
privileged: true
resources:
limits:
cpu: {{ .Values.resourceLimits.cpu }}
memory: {{ .Values.resourceLimits.memory }}
requests:
cpu: {{ .Values.resourceRequests.cpu }}
memory: {{ .Values.resourceRequests.memory }}
google.com/tpu: 4
command:
- "/bin/sh"
- "-c"
@@ -51,19 +52,6 @@ spec:
file_size={{ .Values.fio.fileSize }}
num_of_threads={{ .Values.fio.numThreads }}

{{ if eq .Values.scenario "local-ssd" }}
echo "Installing gsutil..."
apt-get update && apt-get install -y apt-transport-https ca-certificates gnupg curl
curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | gpg --dearmor -o /usr/share/keyrings/cloud.google.gpg
echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt cloud-sdk main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list
apt-get update && apt-get install -y google-cloud-cli

gsutil -m cp -R gs://{{ .Values.bucketName }}/* /data

echo "Sleeping 5 minutes to wait for Local SSD RAID to populate data."
sleep 300
{{ end }}

# We are building fio from source because of the issue: https://github.com/axboe/fio/issues/1668.
# The sed command below is to address internal bug b/309563824.
# As recorded in this bug, fio by-default supports
@@ -162,23 +150,23 @@ spec:
echo "fio job completed!"
volumeMounts:
- name: dshm
mountPath: /dev/shm
mountPath: /shm
- name: data-vol
mountPath: /data
volumes:
- name: gke-gcsfuse-cache
{{ if eq .Values.cacheLocation "boot" }}
emptyDir: {}
{{ else }}
emptyDir:
medium: Memory
{{ end }}
- name: gke-gcsfuse-buffer
emptyDir: {}
- name: dshm
emptyDir:
medium: Memory
- name: data-vol
{{- if eq .Values.scenario "local-ssd" }}
emptyDir: {}
{{- else if eq .Values.scenario "gcsfuse-generic" }}
csi:
driver: gcsfuse.csi.storage.gke.io
volumeAttributes:
bucketName: {{ .Values.bucketName }}
mountOptions: "{{ .Values.gcsfuse.mountOptions }}"
{{- else if eq .Values.scenario "gcsfuse-file-cache" }}
csi:
driver: gcsfuse.csi.storage.gke.io
volumeAttributes:
@@ -188,16 +176,6 @@ spec:
metadataStatCacheCapacity: "{{ .Values.gcsfuse.metadataStatCacheCapacity }}"
fileCacheCapacity: "{{ .Values.gcsfuse.fileCacheCapacity }}"
fileCacheForRangeRead: "true"
gcsfuseLoggingSeverity: "debug"
mountOptions: implicit-dirs
{{- else }}
csi:
driver: gcsfuse.csi.storage.gke.io
volumeAttributes:
bucketName: {{ .Values.bucketName }}
metadataCacheTTLSeconds: "{{ .Values.gcsfuse.metadataCacheTTLSeconds }}"
metadataTypeCacheCapacity: "{{ .Values.gcsfuse.metadataTypeCacheCapacity }}"
metadataStatCacheCapacity: "{{ .Values.gcsfuse.metadataStatCacheCapacity }}"
gcsfuseLoggingSeverity: "debug"
mountOptions: implicit-dirs
{{- end }}
gcsfuseLoggingSeverity: "info"
mountOptions: "{{ .Values.gcsfuse.mountOptions }}"
disableMetrics: "true"
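The privileged `securityContext` above exists so the benchmark can drop the kernel page cache between fio runs, forcing reads to be served by gcsfuse rather than the OS cache. The standard mechanism, for reference:

```bash
sync                               # flush dirty pages to storage first
echo 3 > /proc/sys/vm/drop_caches  # drop page cache, dentries and inodes (needs root/privileged)
```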
@@ -18,13 +18,16 @@
# Declare variables to be passed into your templates.

image: ubuntu:24.04
bucketName: gke-dlio-test-data
bucketName: anushkadhn-us-central2-b-bkt
# scenario controls the kind of storage used for the load test: local-ssd runs directly on LSSD; gcsfuse-generic runs on a gcsfuse mount with gcsfuse.mountOptions passed by the caller; gcsfuse-no-file-cache and gcsfuse-file-cache behave as their names suggest.
scenario: local-ssd
nodeType: n2-standard-96
scenario: gcsfuse-generic
nodeType: ct6e-standard-4t
instanceId: ldap-yyyymmdd-hhmmss
podName:
outputDirPrefix:
nodePool:
cacheLocation:
namespace:

resourceLimits:
cpu: 0
@@ -41,7 +44,7 @@ fio:
numThreads: "50"

gcsfuse:
metadataCacheTTLSeconds: "6048000"
metadataCacheTTLSeconds: "-1"
metadataStatCacheCapacity: "-1"
metadataTypeCacheCapacity: "-1"
fileCacheCapacity: "-1"
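For reference, these volume attributes map onto standalone gcsfuse settings, where a TTL of -1 caches metadata entries indefinitely and a size of -1 means unlimited. An approximate equivalent mount, with a placeholder bucket and mountpoint, assuming a gcsfuse version that accepts the YAML config file:

```bash
cat > /tmp/gcsfuse-config.yaml <<'EOF'
metadata-cache:
  ttl-secs: -1
  stat-cache-max-size-mb: -1
  type-cache-max-size-mb: -1
file-cache:
  max-size-mb: -1
EOF
gcsfuse --implicit-dirs --config-file /tmp/gcsfuse-config.yaml my-bucket /mnt/gcs
```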
@@ -107,7 +107,7 @@ def createOutputScenariosFromDownloadedFiles(args: dict) -> dict:
"""

output = {}
for root, _, files in os.walk(_LOCAL_LOGS_LOCATION + "/" + args.instance_id):
for root, _, files in os.walk(os.path.expanduser(_LOCAL_LOGS_LOCATION + "/" + args.instance_id)):
print(f"Parsing directory {root} ...")

if not files:
@@ -40,6 +40,9 @@ def createHelmInstallCommands(
fioWorkloads: set,
instanceId: str,
machineType: str,
nodePool: str,
fileCacheLoc: str,
namespace: str,
) -> list:
"""Creates helm install commands for the given fioWorkload objects."""
helm_commands = []
@@ -73,7 +76,10 @@
f' gcsfuse.mountOptions={escape_commas_in_string(fioWorkload.gcsfuseMountOptions)}'
),
f'--set nodeType={machineType}',
f'--set namespace={namespace}',
f'--set podName={podName}',
f'--set nodePool={nodePool}',
f'--set cacheLocation={fileCacheLoc}',
f'--set outputDirPrefix={outputDirPrefix}',
f"--set resourceLimits.cpu={resourceLimits['cpu']}",
f"--set resourceLimits.memory={resourceLimits['memory']}",
@@ -94,6 +100,9 @@ def main(args) -> None:
fioWorkloads,
args.instance_id,
args.machine_type,
args.node_pool,
args.file_cache_location,
args.namespace,
)
buckets = (fioWorkload.bucket for fioWorkload in fioWorkloads)
role = 'roles/storage.objectUser'
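A sketch of invoking this runner directly with the new arguments; flag spellings are inferred from the attribute names in `main()` and should be treated as illustrative:

```bash
python3 run_tests.py \
  --workload-config ./workloads/workload1.json \
  --instance-id filecacheon-buffer-memory-boot-2tb \
  --machine-type ct6e-standard-4t \
  --node-pool tpu-pool \
  --file-cache-location memory \
  --namespace memory-2tb
```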