From 8d4651763dd320c910f8cbbe2141adba7acd730a Mon Sep 17 00:00:00 2001 From: Julia Sloan Date: Tue, 18 Jun 2024 15:34:46 -0700 Subject: [PATCH 01/16] add 2 GPU runs to ci pipeline --- .buildkite/pipeline.yml | 883 ++++++++++++++++++++-------------------- 1 file changed, 452 insertions(+), 431 deletions(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index a38aeba8c..a48d6aca2 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -33,31 +33,31 @@ steps: - "julia --project -e 'using Pkg; Pkg.precompile()'" - "julia --project -e 'using Pkg; Pkg.status()'" - - echo "--- Instantiate sea breeze env" - - "julia --project=experiments/ClimaCore/sea_breeze -e 'using Pkg; Pkg.instantiate(;verbose=true)'" - - "julia --project=experiments/ClimaCore/sea_breeze -e 'using Pkg; Pkg.precompile()'" - - "julia --project=experiments/ClimaCore/sea_breeze -e 'using Pkg; Pkg.status()'" + # - echo "--- Instantiate sea breeze env" + # - "julia --project=experiments/ClimaCore/sea_breeze -e 'using Pkg; Pkg.instantiate(;verbose=true)'" + # - "julia --project=experiments/ClimaCore/sea_breeze -e 'using Pkg; Pkg.precompile()'" + # - "julia --project=experiments/ClimaCore/sea_breeze -e 'using Pkg; Pkg.status()'" - - echo "--- Instantiate climacore exp env" - - "julia --project=experiments/ClimaCore/ -e 'using Pkg; Pkg.instantiate(;verbose=true)'" - - "julia --project=experiments/ClimaCore/ -e 'using Pkg; Pkg.precompile()'" - - "julia --project=experiments/ClimaCore/ -e 'using Pkg; Pkg.status()'" + # - echo "--- Instantiate climacore exp env" + # - "julia --project=experiments/ClimaCore/ -e 'using Pkg; Pkg.instantiate(;verbose=true)'" + # - "julia --project=experiments/ClimaCore/ -e 'using Pkg; Pkg.precompile()'" + # - "julia --project=experiments/ClimaCore/ -e 'using Pkg; Pkg.status()'" - echo "--- Instantiate ClimaEarth env" - "julia --project=experiments/ClimaEarth/ -e 'using Pkg; Pkg.instantiate(;verbose=true)'" - "julia --project=experiments/ClimaEarth/ -e 
'using Pkg; Pkg.precompile()'" - "julia --project=experiments/ClimaEarth/ -e 'using Pkg; Pkg.status()'" - - echo "--- Instantiate perf env" - - "julia --project=perf/ -e 'using Pkg; Pkg.instantiate(;verbose=true)'" - - "julia --project=perf/ -e 'using Pkg; Pkg.precompile()'" - - "julia --project=perf/ -e 'using Pkg; Pkg.status()'" + # - echo "--- Instantiate perf env" + # - "julia --project=perf/ -e 'using Pkg; Pkg.instantiate(;verbose=true)'" + # - "julia --project=perf/ -e 'using Pkg; Pkg.precompile()'" + # - "julia --project=perf/ -e 'using Pkg; Pkg.status()'" - - echo "--- Instantiate test env" - - "julia --project=test/ -e 'using Pkg; Pkg.develop(path=\".\")'" - - "julia --project=test/ -e 'using Pkg; Pkg.instantiate(;verbose=true)'" - - "julia --project=test/ -e 'using Pkg; Pkg.precompile()'" - - "julia --project=test/ -e 'using Pkg; Pkg.status()'" + # - echo "--- Instantiate test env" + # - "julia --project=test/ -e 'using Pkg; Pkg.develop(path=\".\")'" + # - "julia --project=test/ -e 'using Pkg; Pkg.instantiate(;verbose=true)'" + # - "julia --project=test/ -e 'using Pkg; Pkg.precompile()'" + # - "julia --project=test/ -e 'using Pkg; Pkg.status()'" - echo "--- Download artifacts" - "julia --project=artifacts -e 'using Pkg; Pkg.instantiate(;verbose=true)'" @@ -76,364 +76,365 @@ steps: - wait - - group: "Unit Tests" - steps: - - label: "MPI Regridder unit tests" - key: "regridder_mpi_tests" - command: "srun julia --color=yes --project=test/ test/mpi_tests/regridder_mpi_tests.jl --config_file $CONFIG_PATH/regridder_mpi.yml" - timeout_in_minutes: 20 - env: - CLIMACOMMS_CONTEXT: "MPI" - NPROCS: 2 - agents: - slurm_ntasks: 2 - slurm_mem: 16GB - - - label: "MPI BCReader unit tests" - key: "bcreader_mpi_tests" - command: "srun julia --color=yes --project=test/ test/mpi_tests/bcreader_mpi_tests.jl --job_id bcreader_mpi" - timeout_in_minutes: 20 - env: - CLIMACOMMS_CONTEXT: "MPI" - agents: - slurm_ntasks: 2 - slurm_mem: 16GB - - - label: "MPI Checkpointer unit tests" 
- key: "checkpointer_mpi_tests" - command: "srun julia --color=yes --project=test/ test/mpi_tests/checkpointer_mpi_tests.jl --job_id checkpointer_mpi" - timeout_in_minutes: 20 - env: - CLIMACOMMS_CONTEXT: "MPI" - agents: - slurm_ntasks: 2 - slurm_mem: 16GB - - - label: "MPI Utilities unit tests" - key: "utilities_mpi_tests" - command: "srun julia --color=yes --project=test/ test/utilities_tests.jl --job_id utilities_mpi" - timeout_in_minutes: 20 - env: - CLIMACOMMS_CONTEXT: "MPI" - agents: - slurm_ntasks: 2 - slurm_mem: 16GB - - - label: "Perf flame graph diff tests" - command: "julia --color=yes --project=perf/ perf/flame_test.jl --job_id flame_perf_target" - timeout_in_minutes: 5 - agents: - slurm_mem: 16GB - - - group: "GPU: unit tests and global bucket" - steps: - - label: "GPU runtests" - command: "julia --color=yes --project=test/ test/runtests.jl" - env: - CLIMACOMMS_DEVICE: "CUDA" - agents: - slurm_ntasks: 1 - slurm_gres: "gpu:1" - - - group: "Integration Tests" - steps: - - # Drivers for release > - - # SLABPLANET - - # Slabplanet default: - # - this is the most lightweight example with conservation and visual checks, with CLI specification as follows - # - numerics: dt = dt_cpl = 200s, nelem = 4 - # - physics: bulk aerodynamic surface fluxes, gray radiation, idealized insolation, equil moisture model, 0-moment microphysics - # - input data: monotonous remapping (land mask, SST, SIC) - # - slurm: unthreaded, 1 ntask - # - diagnostics: check and plot energy conservation, output plots after 9 days - - label: "Slabplanet: default" - key: "slabplanet_default" - command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/slabplanet_default.yml --job_id slabplanet_default" - artifact_paths: "experiments/ClimaEarth/output/slabplanet/slabplanet_default_artifacts/*" - agents: - slurm_mem: 20GB - - - label: "Slabplanet: dry, no radiation" - key: "slabplanet_dry_norad" - command: "julia --color=yes 
--project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/slabplanet_dry_norad.yml --job_id slabplanet_dry_norad" - artifact_paths: "experiments/ClimaEarth/output/slabplanet/slabplanet_dry_norad_artifacts/*" - agents: - slurm_mem: 20GB - - - label: "Slabplanet: default with Float32" - key: "slabplanet_ft32" - command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/slabplanet_ft32.yml --job_id slabplanet_ft32" - artifact_paths: "experiments/ClimaEarth/output/slabplanet/slabplanet_ft32_artifacts/*" - agents: - slurm_mem: 20GB - - - label: "Slabplanet: partitioned turbulent fluxes" - key: "slabplanet_partitioned_fluxes" - command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/slabplanet_partitioned_fluxes.yml --job_id slabplanet_partitioned_fluxes" - artifact_paths: "experiments/ClimaEarth/output/slabplanet/slabplanet_partitioned_fluxes_artifacts/*" - agents: - slurm_mem: 20GB - - - label: "Slabplanet: non-monotonous surface remap" - key: "slabplanet_nonmono" - command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/slabplanet_nonmono.yml --job_id slabplanet_nonmono" - artifact_paths: "experiments/ClimaEarth/output/slabplanet/slabplanet_nonmono_artifacts/*" - agents: - slurm_mem: 20GB - - - label: "Slabplanet: albedo from static map" - key: "slabplanet_albedo_static_map" - command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/slabplanet_albedo_static_map.yml --job_id slabplanet_albedo_static_map" - artifact_paths: "experiments/ClimaEarth/output/slabplanet/slabplanet_albedo_static_map_artifacts/total_energy*.png" - agents: - slurm_mem: 20GB - - - label: "Slabplanet: albedo from temporal map" - key: "slabplanet_albedo_temporal_map" - command: "julia --color=yes 
--project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/slabplanet_albedo_temporal_map.yml --job_id slabplanet_albedo_temporal_map" - artifact_paths: "experiments/ClimaEarth/output/slabplanet/slabplanet_albedo_temporal_map_artifacts/*" - agents: - slurm_mem: 20GB - - - label: "Slabplanet: albedo from function" - key: "slabplanet_albedo_function" - command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/slabplanet_albedo_function.yml --job_id slabplanet_albedo_function" - artifact_paths: "experiments/ClimaEarth/output/slabplanet/slabplanet_albedo_function_artifacts/*" - agents: - slurm_mem: 20GB - - - label: "Slabplanet: eisenman sea ice" - key: "slabplanet_eisenman" - command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/slabplanet_eisenman.yml --job_id slabplanet_eisenman" - artifact_paths: "experiments/ClimaEarth/output/slabplanet_eisenman/slabplanet_eisenman_artifacts/*" - agents: - slurm_mem: 20GB - - - label: "Slabplanet: extra atmos diagnostics" - key: "slabplanet_atmos_diags" - command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/slabplanet_atmos_diags.yml --job_id slabplanet_atmos_diags" - artifact_paths: "experiments/ClimaEarth/output/slabplanet/slabplanet_atmos_diags_artifacts/*" - agents: - slurm_mem: 20GB - - # AMIP - - # ... 
- - - # PERFORMANCE - - # slabplanet default: track unthreaded performance (alloc tests, flame graph, flame graph diff, build history) - - label: ":rocket: Slabplanet: default (unthreaded)" - key: "slabplanet_unthreaded" - command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/default_unthreaded.yml --job_id default_unthreaded" - artifact_paths: "experiments/ClimaEarth/output/slabplanet/default_unthreaded_artifacts/*" - env: - FLAME_PLOT: "" - BUILD_HISTORY_HANDLE: "" - agents: - slurm_ntasks: 1 - slurm_mem: 20GB - - - label: ":rocket: Slabplanet: default (unthreaded) - flame graph and allocation tests" - command: "julia --color=yes --project=perf perf/flame.jl --config_file $PERF_CONFIG_PATH/perf_default_unthreaded.yml --job_id perf_default_unthreaded" - artifact_paths: "perf/output/perf_default_unthreaded/*" - agents: - slurm_mem: 20GB - - - label: ":rocket: Slabplanet: default (unthreaded) - flame graph diff" - command: "julia --color=yes --project=perf perf/flame_diff.jl --config_file $PERF_CONFIG_PATH/perf_diff_default_unthreaded.yml --job_id perf_diff_default_unthreaded" - artifact_paths: "perf/output/perf_diff_default_unthreaded/*" - agents: - slurm_mem: 20GB - - # < end Drivers for release - - # CLIMACORE EXPERIMENTS - - - label: "sea_breeze" - command: "julia --color=yes --project=experiments/ClimaCore/sea_breeze experiments/ClimaCore/sea_breeze/run.jl" - artifact_paths: "experiments/ClimaCore/sea_breeze/output/*" - agents: - slurm_mem: 20GB - - - label: "heat-diffusion" - command: "julia --color=yes --project=experiments/ClimaCore/ experiments/ClimaCore/heat-diffusion/run.jl" - artifact_paths: "experiments/ClimaCore/output/heat-diffusion_artifacts/*" - agents: - slurm_mem: 20GB - - # AMIP AND SLABPLANET EXPERIMENTS - - - label: "Moist earth with slab surface - default: monin gray no_sponge idealinsol freq_dt_cpl" - command: "julia --color=yes --project=experiments/ClimaEarth/ 
experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/default_mono.yml --job_id default_mono" - artifact_paths: "experiments/ClimaEarth/output/slabplanet/default_mono_artifacts/total_energy*.png" - agents: - slurm_mem: 20GB - - - label: "Moist earth with slab surface - notmono: monin gray no_sponge idealinsol freq_dt_cpl notmono" - command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/default_notmono.yml --job_id default_notmono" - artifact_paths: "experiments/ClimaEarth/output/slabplanet/default_notmono_artifacts/total_energy*.png" - agents: - slurm_mem: 20GB - - # - label: "Moist earth with slab surface - test: monin allsky sponge idealinsol infreq_dt_cpl" - # command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --FLOAT_TYPE Float64 --coupled true --surface_setup PrescribedSurface --moist equil --vert_diff true --rad allskywithclear --rayleigh_sponge true --alpha_rayleigh_uh 0 --alpha_rayleigh_w 10 --energy_check true --mode_name slabplanet --t_end 10days --dt_save_to_sol 3600secs --dt_cpl 21600 --dt 200secs --dt_rad 6hours --mono_surface true --h_elem 4 --precip_model 0M --job_id target_params_in_slab_test1" # Unconverged SF (reproduced locally); works with 200s dt_cpl - # artifact_paths: "experiments/ClimaEarth/output/slabplanet/target_params_in_slab_test1_artifacts/total_energy*.png" - - - label: "Moist earth with slab surface - test: bulk allsky sponge realinsol infreq_dt_cpl" - command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/target_params_in_slab_test2.yml --job_id target_params_in_slab_test2" - artifact_paths: "experiments/ClimaEarth/output/slabplanet/target_params_in_slab_test2_artifacts/total_energy*.png" - agents: - slurm_mem: 20GB - - - label: "Moist earth with slab surface - test: monin gray sponge realinsol infreq_dt_cpl" - command: "julia --color=yes 
--project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/target_params_in_slab_test3.yml --job_id target_params_in_slab_test3" - artifact_paths: "experiments/ClimaEarth/output/slabplanet/target_params_in_slab_test3_artifacts/total_energy*.png" - agents: - slurm_mem: 20GB - - # breaking: - # - label: "Moist earth with slab surface - monin allsky no_sponge idealinsol infreq_dt_cpl" - # command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --coupled true --surface_setup PrescribedSurface --moist equil --vert_diff true --rad allskywithclear --rayleigh_sponge false --energy_check true --mode_name slabplanet --t_end 10days --dt_save_to_sol 3600secs --dt_cpl 21600 --dt 200secs --dt_rad 6hours --idealized_insolation true --mono_surface true --h_elem 4 --precip_model 0M --job_id target_params_in_slab1" - # artifact_paths: "experiments/ClimaEarth/output/slabplanet/target_params_in_slab1_artifacts/total_energy*.png" - - - label: "AMIP target: albedo from function" - key: "target_amip_albedo_function" - command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/target_amip_albedo_function.yml --job_id target_amip_albedo_function" - artifact_paths: "experiments/ClimaEarth/output/amip/target_amip_albedo_function_artifacts/*" - agents: - slurm_mem: 20GB - - - label: "AMIP - Float64 + hourly checkpoint" - key: "amip" - command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/coarse_single_ft64_hourly_checkpoints.yml --job_id coarse_single_ft64_hourly_checkpoints" - artifact_paths: "experiments/ClimaEarth/output/amip/coarse_single_ft64_hourly_checkpoints_artifacts/*" - env: - FLAME_PLOT: "" - BUILD_HISTORY_HANDLE: "" - agents: - slurm_ntasks: 1 - slurm_mem: 20GB - - - label: "AMIP - Float64 + hourly checkpoint + co2" - key: "coarse_single_ft64_hourly_checkpoints_co2" - 
command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/coarse_single_ft64_hourly_checkpoints_co2.yml --job_id coarse_single_ft64_hourly_checkpoints_co2" - artifact_paths: "experiments/ClimaEarth/output/amip/coarse_single_ft64_hourly_checkpoints_co2_artifacts/*" - env: - FLAME_PLOT: "" - BUILD_HISTORY_HANDLE: "" - agents: - slurm_ntasks: 1 - slurm_mem: 20GB - - - label: "AMIP - Float64 test" - command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/coarse_single_ft64.yml --job_id coarse_single_ft64" - artifact_paths: "experiments/ClimaEarth/output/amip/coarse_single_ft64_artifacts/*" - agents: - slurm_ntasks: 1 - slurm_mem: 20GB - - - label: "AMIP - Float32 test" - command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/coarse_single_ft32.yml --job_id coarse_single_ft32" - artifact_paths: "experiments/ClimaEarth/output/amip/coarse_single_ft32_artifacts/*" - agents: - slurm_ntasks: 1 - slurm_mem: 20GB - - - label: "MPI AMIP" - command: "srun julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/coarse_mpi_n2.yml --job_id coarse_mpi_n2" - artifact_paths: "experiments/ClimaEarth/output/amip/coarse_mpi_n2_artifacts/*" - timeout_in_minutes: 240 - env: - CLIMACOMMS_CONTEXT: "MPI" - agents: - slurm_ntasks: 2 - slurm_mem: 16GB - - - # short high-res performance test - - label: "Unthreaded AMIP FINE" # also reported by longruns with a flame graph - key: "unthreaded_amip_fine" - command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/target_amip_n1_shortrun.yml --job_id target_amip_n1_shortrun" - artifact_paths: "experiments/ClimaEarth/output/amip/target_amip_n1_shortrun_artifacts/*" - env: - BUILD_HISTORY_HANDLE: "" - agents: - slurm_mem: 20GB - - 
# PERFORMANCE RUNS: flame graphs + allocation tests - - - label: ":rocket: flame graph and allocation tests: perf_coarse_single_ft64" - command: "julia --color=yes --project=perf perf/flame.jl --config_file $PERF_CONFIG_PATH/perf_coarse_single_ft64.yml --job_id perf_coarse_single_ft64" - artifact_paths: "perf/output/perf_coarse_single_ft64/*" - agents: - slurm_mem: 20GB - - - label: ":rocket: performance: flame graph diff: perf_diff_coarse_single_ft64" - command: "julia --color=yes --project=perf perf/flame_diff.jl --config_file $PERF_CONFIG_PATH/perf_diff_coarse_single_ft64.yml --job_id perf_diff_coarse_single_ft64" - artifact_paths: "perf/output/perf_diff_coarse_single_ft64/*" - agents: - slurm_mem: 20GB - - - group: "Hierarchy tests (1d)" - steps: - - label: ":construction: Dry Held Suarez" - key: "dry_held_suarez" - command: - - sed 's/t_end = "1000days"/t_end = "1days"/' experiments/ClimaEarth/run_dry_held_suarez.jl > experiments/ClimaEarth/run_dry_held_suarez_short.jl - - "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_dry_held_suarez_short.jl" - artifact_paths: "dry_held_suarez/dry_held_suarez/clima_atmos/*" - agents: - slurm_mem: 20GB - - - label: ":construction: Moist Held Suarez" - key: "moist_held_suarez" - command: - - sed 's/t_end = "1000days"/t_end = "1days"/' experiments/ClimaEarth/run_moist_held_suarez.jl > experiments/ClimaEarth/run_moist_held_suarez_short.jl - - "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_moist_held_suarez_short.jl" - artifact_paths: "moist_held_suarez/moist_held_suarez/clima_atmos/*" - agents: - slurm_mem: 20GB - - - label: ":construction: Cloudless Aquaplanet" - key: "cloudless_aquaplanet" - command: - - sed 's/t_end = "1000days"/t_end = "1days"/' experiments/ClimaEarth/run_cloudless_aquaplanet.jl > experiments/ClimaEarth/run_cloudless_aquaplanet_short.jl - - "julia --color=yes --project=experiments/ClimaEarth/ 
experiments/ClimaEarth/run_cloudless_aquaplanet_short.jl" - artifact_paths: "cloudless_aquaplanet/cloudless_aquaplanet/clima_atmos/*" - agents: - slurm_mem: 20GB - - - label: ":construction: Cloudy Aquaplanet" - key: "cloudy_aquaplanet" - command: - - sed 's/t_end = "1000days"/t_end = "1days"/' experiments/ClimaEarth/run_cloudy_aquaplanet.jl > experiments/ClimaEarth/run_cloudy_aquaplanet_short.jl - - "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_cloudy_aquaplanet_short.jl" - artifact_paths: "cloudy_aquaplanet/cloudy_aquaplanet/clima_atmos/*" - agents: - slurm_mem: 20GB - - - label: ":construction: Cloudy Slabplanet" - key: "cloudy_slabplanet" - command: - - sed 's/t_end = "1000days"/t_end = "1days"/' experiments/ClimaEarth/run_cloudy_slabplanet.jl > experiments/ClimaEarth/run_cloudy_slabplanet_short.jl - - "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_cloudy_slabplanet_short.jl" - artifact_paths: "cloudy_slabplanet/cloudy_slabplanet/clima_atmos/*" - agents: - slurm_mem: 20GB - - - wait - - label: ":construction: Hierarchy plots" - key: "hierarchy_plots" - command: - - "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/hierarchy/climate_plots.jl" - artifact_paths: "paper_figs/*" - agents: - slurm_mem: 20GB + # - group: "Unit Tests" + # steps: + # - label: "MPI Regridder unit tests" + # key: "regridder_mpi_tests" + # command: "srun julia --color=yes --project=test/ test/mpi_tests/regridder_mpi_tests.jl --config_file $CONFIG_PATH/regridder_mpi.yml" + # timeout_in_minutes: 20 + # env: + # CLIMACORE_DISTRIBUTED: "MPI" + # NPROCS: 2 + # agents: + # slurm_ntasks: 2 + # slurm_mem: 16GB + + # - label: "MPI BCReader unit tests" + # key: "bcreader_mpi_tests" + # command: "srun julia --color=yes --project=test/ test/mpi_tests/bcreader_mpi_tests.jl --job_id bcreader_mpi" + # timeout_in_minutes: 20 + # env: + # CLIMACORE_DISTRIBUTED: "MPI" + # agents: + # slurm_ntasks: 2 + # slurm_mem: 
16GB + + # - label: "MPI Checkpointer unit tests" + # key: "checkpointer_mpi_tests" + # command: "srun julia --color=yes --project=test/ test/mpi_tests/checkpointer_mpi_tests.jl --job_id checkpointer_mpi" + # timeout_in_minutes: 20 + # env: + # CLIMACORE_DISTRIBUTED: "MPI" + # agents: + # slurm_ntasks: 2 + # slurm_mem: 16GB + + # - label: "MPI Utilities unit tests" + # key: "utilities_mpi_tests" + # command: "srun julia --color=yes --project=test/ test/utilities_tests.jl --job_id utilities_mpi" + # timeout_in_minutes: 20 + # env: + # CLIMACORE_DISTRIBUTED: "MPI" + # agents: + # slurm_ntasks: 2 + # slurm_mem: 16GB + + # - label: "Perf flame graph diff tests" + # command: "julia --color=yes --project=perf/ perf/flame_test.jl --job_id flame_perf_target" + # timeout_in_minutes: 5 + # agents: + # slurm_mem: 16GB + + # - group: "GPU: unit tests and global bucket" + # steps: + # - label: "GPU runtests" + # command: "julia --color=yes --project=test/ test/runtests.jl" + # agents: + # slurm_ntasks: 1 + # slurm_gres: "gpu:1" + + # - group: "Integration Tests" + # steps: + + # # Drivers for release > + + # # SLABPLANET + + # # Slabplanet default: + # # - this is the most lightweight example with conservation and visual checks, with CLI specification as follows + # # - numerics: dt = dt_cpl = 200s, nelem = 4 + # # - physics: bulk aerodynamic surface fluxes, gray radiation, idealized insolation, equil moisture model, 0-moment microphysics + # # - input data: monotonous remapping (land mask, SST, SIC) + # # - slurm: unthreaded, 1 ntask + # # - diagnostics: check and plot energy conservation, output plots after 9 days + # - label: "Slabplanet: default" + # key: "slabplanet_default" + # command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/slabplanet_default.yml --job_id slabplanet_default" + # artifact_paths: "experiments/ClimaEarth/output/slabplanet/slabplanet_default_artifacts/*" + # agents: + # slurm_mem: 
20GB + + # - label: "Slabplanet: dry, no radiation" + # key: "slabplanet_dry_norad" + # command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/slabplanet_dry_norad.yml --job_id slabplanet_dry_norad" + # artifact_paths: "experiments/ClimaEarth/output/slabplanet/slabplanet_dry_norad_artifacts/*" + # agents: + # slurm_mem: 20GB + + # - label: "Slabplanet: default with Float32" + # key: "slabplanet_ft32" + # command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/slabplanet_ft32.yml --job_id slabplanet_ft32" + # artifact_paths: "experiments/ClimaEarth/output/slabplanet/slabplanet_ft32_artifacts/*" + # agents: + # slurm_mem: 20GB + + # - label: "Slabplanet: partitioned turbulent fluxes" + # key: "slabplanet_partitioned_fluxes" + # command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/slabplanet_partitioned_fluxes.yml --job_id slabplanet_partitioned_fluxes" + # artifact_paths: "experiments/ClimaEarth/output/slabplanet/slabplanet_partitioned_fluxes_artifacts/*" + # agents: + # slurm_mem: 20GB + + # - label: "Slabplanet: non-monotonous surface remap" + # key: "slabplanet_nonmono" + # command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/slabplanet_nonmono.yml --job_id slabplanet_nonmono" + # artifact_paths: "experiments/ClimaEarth/output/slabplanet/slabplanet_nonmono_artifacts/*" + # agents: + # slurm_mem: 20GB + + # - label: "Slabplanet: albedo from static map" + # key: "slabplanet_albedo_static_map" + # command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/slabplanet_albedo_static_map.yml --job_id slabplanet_albedo_static_map" + # artifact_paths: 
"experiments/ClimaEarth/output/slabplanet/slabplanet_albedo_static_map_artifacts/total_energy*.png" + # agents: + # slurm_mem: 20GB + + # - label: "Slabplanet: albedo from temporal map" + # key: "slabplanet_albedo_temporal_map" + # command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/slabplanet_albedo_temporal_map.yml --job_id slabplanet_albedo_temporal_map" + # artifact_paths: "experiments/ClimaEarth/output/slabplanet/slabplanet_albedo_temporal_map_artifacts/*" + # agents: + # slurm_mem: 20GB + + # - label: "Slabplanet: albedo from function" + # key: "slabplanet_albedo_function" + # command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/slabplanet_albedo_function.yml --job_id slabplanet_albedo_function" + # artifact_paths: "experiments/ClimaEarth/output/slabplanet/slabplanet_albedo_function_artifacts/*" + # agents: + # slurm_mem: 20GB + + # - label: "Slabplanet: eisenman sea ice" + # key: "slabplanet_eisenman" + # command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/slabplanet_eisenman.yml --job_id slabplanet_eisenman" + # artifact_paths: "experiments/ClimaEarth/output/slabplanet_eisenman/slabplanet_eisenman_artifacts/*" + # agents: + # slurm_mem: 20GB + + # - label: "Slabplanet: extra atmos diagnostics" + # key: "slabplanet_atmos_diags" + # command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/slabplanet_atmos_diags.yml --job_id slabplanet_atmos_diags" + # artifact_paths: "experiments/ClimaEarth/output/slabplanet/slabplanet_atmos_diags_artifacts/*" + # agents: + # slurm_mem: 20GB + + # # AMIP + + # # ... 
+ + + # # PERFORMANCE + + # # slabplanet default: track unthreaded performance (alloc tests, flame graph, flame graph diff, build history) + # - label: ":rocket: Slabplanet: default (unthreaded)" + # key: "slabplanet_unthreaded" + # command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/default_unthreaded.yml --job_id default_unthreaded" + # artifact_paths: "experiments/ClimaEarth/output/slabplanet/default_unthreaded_artifacts/*" + # env: + # FLAME_PLOT: "" + # BUILD_HISTORY_HANDLE: "" + # agents: + # slurm_ntasks: 1 + # slurm_mem: 20GB + + # - label: ":rocket: Slabplanet: default (unthreaded) - flame graph and allocation tests" + # command: "julia --color=yes --project=perf perf/flame.jl --config_file $PERF_CONFIG_PATH/perf_default_unthreaded.yml --job_id perf_default_unthreaded" + # artifact_paths: "perf/output/perf_default_unthreaded/*" + # agents: + # slurm_mem: 20GB + + # - label: ":rocket: Slabplanet: default (unthreaded) - flame graph diff" + # command: "julia --color=yes --project=perf perf/flame_diff.jl --config_file $PERF_CONFIG_PATH/perf_diff_default_unthreaded.yml --job_id perf_diff_default_unthreaded" + # artifact_paths: "perf/output/perf_diff_default_unthreaded/*" + # agents: + # slurm_mem: 20GB + + # # < end Drivers for release + + # # CLIMACORE EXPERIMENTS + + # - label: "sea_breeze" + # command: "julia --color=yes --project=experiments/ClimaCore/sea_breeze experiments/ClimaCore/sea_breeze/run.jl" + # artifact_paths: "experiments/ClimaCore/sea_breeze/output/*" + # agents: + # slurm_mem: 20GB + + # - label: "heat-diffusion" + # command: "julia --color=yes --project=experiments/ClimaCore/ experiments/ClimaCore/heat-diffusion/run.jl" + # artifact_paths: "experiments/ClimaCore/output/heat-diffusion_artifacts/*" + # agents: + # slurm_mem: 20GB + + # # AMIP AND SLABPLANET EXPERIMENTS + + # - label: "Moist earth with slab surface - default: monin gray no_sponge idealinsol freq_dt_cpl" + # 
command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/default_mono.yml --job_id default_mono" + # artifact_paths: "experiments/ClimaEarth/output/slabplanet/default_mono_artifacts/total_energy*.png" + # agents: + # slurm_mem: 20GB + + # - label: "Moist earth with slab surface - notmono: monin gray no_sponge idealinsol freq_dt_cpl notmono" + # command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/default_notmono.yml --job_id default_notmono" + # artifact_paths: "experiments/ClimaEarth/output/slabplanet/default_notmono_artifacts/total_energy*.png" + # agents: + # slurm_mem: 20GB + + # # - label: "Moist earth with slab surface - test: monin allsky sponge idealinsol infreq_dt_cpl" + # # command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --FLOAT_TYPE Float64 --coupled true --surface_setup PrescribedSurface --moist equil --vert_diff true --rad allskywithclear --rayleigh_sponge true --alpha_rayleigh_uh 0 --alpha_rayleigh_w 10 --energy_check true --mode_name slabplanet --t_end 10days --dt_save_to_sol 3600secs --dt_cpl 21600 --dt 200secs --dt_rad 6hours --mono_surface true --h_elem 4 --precip_model 0M --job_id target_params_in_slab_test1" # Unconverged SF (reproduced locally); works with 200s dt_cpl + # # artifact_paths: "experiments/ClimaEarth/output/slabplanet/target_params_in_slab_test1_artifacts/total_energy*.png" + + # - label: "Moist earth with slab surface - test: bulk allsky sponge realinsol infreq_dt_cpl" + # command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/target_params_in_slab_test2.yml --job_id target_params_in_slab_test2" + # artifact_paths: "experiments/ClimaEarth/output/slabplanet/target_params_in_slab_test2_artifacts/total_energy*.png" + # agents: + # slurm_mem: 20GB + + # - label: "Moist earth with 
slab surface - test: monin gray sponge realinsol infreq_dt_cpl" + # command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/target_params_in_slab_test3.yml --job_id target_params_in_slab_test3" + # artifact_paths: "experiments/ClimaEarth/output/slabplanet/target_params_in_slab_test3_artifacts/total_energy*.png" + # agents: + # slurm_mem: 20GB + + # # breaking: + # # - label: "Moist earth with slab surface - monin allsky no_sponge idealinsol infreq_dt_cpl" + # # command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --coupled true --surface_setup PrescribedSurface --moist equil --vert_diff true --rad allskywithclear --rayleigh_sponge false --energy_check true --mode_name slabplanet --t_end 10days --dt_save_to_sol 3600secs --dt_cpl 21600 --dt 200secs --dt_rad 6hours --idealized_insolation true --mono_surface true --h_elem 4 --precip_model 0M --job_id target_params_in_slab1" + # # artifact_paths: "experiments/ClimaEarth/output/slabplanet/target_params_in_slab1_artifacts/total_energy*.png" + + # - label: "AMIP target: albedo from function" + # key: "target_amip_albedo_function" + # command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/target_amip_albedo_function.yml --job_id target_amip_albedo_function" + # artifact_paths: "experiments/ClimaEarth/output/amip/target_amip_albedo_function_artifacts/*" + # agents: + # slurm_mem: 20GB + + # - label: "AMIP - Float64 + hourly checkpoint" + # key: "amip" + # command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/coarse_single_ft64_hourly_checkpoints.yml --job_id coarse_single_ft64_hourly_checkpoints" + # artifact_paths: "experiments/ClimaEarth/output/amip/coarse_single_ft64_hourly_checkpoints_artifacts/*" + # env: + # FLAME_PLOT: "" + # BUILD_HISTORY_HANDLE: "" + # agents: + # 
slurm_ntasks: 1 + # slurm_mem: 20GB + + # - label: "AMIP - Float64 + hourly checkpoint + co2" + # key: "coarse_single_ft64_hourly_checkpoints_co2" + # command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/coarse_single_ft64_hourly_checkpoints_co2.yml --job_id coarse_single_ft64_hourly_checkpoints_co2" + # artifact_paths: "experiments/ClimaEarth/output/amip/coarse_single_ft64_hourly_checkpoints_co2_artifacts/*" + # env: + # FLAME_PLOT: "" + # BUILD_HISTORY_HANDLE: "" + # agents: + # slurm_ntasks: 1 + # slurm_mem: 20GB + + # - label: "AMIP - Float64 test" + # command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/coarse_single_ft64.yml --job_id coarse_single_ft64" + # artifact_paths: "experiments/ClimaEarth/output/amip/coarse_single_ft64_artifacts/*" + # agents: + # slurm_ntasks: 1 + # slurm_mem: 20GB + + # - label: "AMIP - Float32 test" + # command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/coarse_single_ft32.yml --job_id coarse_single_ft32" + # artifact_paths: "experiments/ClimaEarth/output/amip/coarse_single_ft32_artifacts/*" + # agents: + # slurm_ntasks: 1 + # slurm_mem: 20GB + + # - label: "MPI AMIP" + # command: "srun julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/coarse_mpi_n2.yml --job_id coarse_mpi_n2" + # artifact_paths: "experiments/ClimaEarth/output/amip/coarse_mpi_n2_artifacts/*" + # timeout_in_minutes: 240 + # env: + # CLIMACORE_DISTRIBUTED: "MPI" + # agents: + # slurm_ntasks: 2 + # slurm_mem: 16GB + + + # # short high-res performance test + # - label: "Unthreaded AMIP FINE" # also reported by longruns with a flame graph + # key: "unthreaded_amip_fine" + # command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file 
$CONFIG_PATH/target_amip_n1_shortrun.yml --job_id target_amip_n1_shortrun" + # artifact_paths: "experiments/ClimaEarth/output/amip/target_amip_n1_shortrun_artifacts/*" + # env: + # BUILD_HISTORY_HANDLE: "" + # agents: + # slurm_mem: 20GB + + # # PERFORMANCE RUNS: flame graphs + allocation tests + + # - label: ":rocket: flame graph and allocation tests: perf_coarse_single_ft64" + # command: "julia --color=yes --project=perf perf/flame.jl --config_file $PERF_CONFIG_PATH/perf_coarse_single_ft64.yml --job_id perf_coarse_single_ft64" + # artifact_paths: "perf/output/perf_coarse_single_ft64/*" + # agents: + # slurm_mem: 20GB + + # - label: ":rocket: performance: flame graph diff: perf_diff_coarse_single_ft64" + # command: "julia --color=yes --project=perf perf/flame_diff.jl --config_file $PERF_CONFIG_PATH/perf_diff_coarse_single_ft64.yml --job_id perf_diff_coarse_single_ft64" + # artifact_paths: "perf/output/perf_diff_coarse_single_ft64/*" + # agents: + # slurm_mem: 20GB + + # - group: "Hierarchy tests (1d)" + # steps: + # - label: ":construction: Dry Held Suarez" + # key: "dry_held_suarez" + # command: + # - sed 's/t_end = "1000days"/t_end = "1days"/' experiments/ClimaEarth/run_dry_held_suarez.jl > experiments/ClimaEarth/run_dry_held_suarez_short.jl + # - "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_dry_held_suarez_short.jl" + # artifact_paths: "dry_held_suarez/dry_held_suarez/clima_atmos/*" + # agents: + # slurm_mem: 20GB + + # - label: ":construction: Moist Held Suarez" + # key: "moist_held_suarez" + # command: + # - sed 's/t_end = "1000days"/t_end = "1days"/' experiments/ClimaEarth/run_moist_held_suarez.jl > experiments/ClimaEarth/run_moist_held_suarez_short.jl + # - "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_moist_held_suarez_short.jl" + # artifact_paths: "moist_held_suarez/moist_held_suarez/clima_atmos/*" + # agents: + # slurm_mem: 20GB + + # - label: ":construction: Cloudless Aquaplanet" + # 
key: "cloudless_aquaplanet" + # command: + # - sed 's/t_end = "1000days"/t_end = "1days"/' experiments/ClimaEarth/run_cloudless_aquaplanet.jl > experiments/ClimaEarth/run_cloudless_aquaplanet_short.jl + # - "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_cloudless_aquaplanet_short.jl" + # artifact_paths: "cloudless_aquaplanet/cloudless_aquaplanet/clima_atmos/*" + + # agents: + # slurm_mem: 20GB + + # - label: ":construction: Cloudy Aquaplanet" + # key: "cloudy_aquaplanet" + # command: + # - sed 's/t_end = "1000days"/t_end = "1days"/' experiments/ClimaEarth/run_cloudy_aquaplanet.jl > experiments/ClimaEarth/run_cloudy_aquaplanet_short.jl + # - "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_cloudy_aquaplanet_short.jl" + # artifact_paths: "cloudy_aquaplanet/cloudy_aquaplanet/clima_atmos/*" + + # agents: + # slurm_mem: 20GB + + # - label: ":construction: Cloudy Slabplanet" + # key: "cloudy_slabplanet" + # command: + # - sed 's/t_end = "1000days"/t_end = "1days"/' experiments/ClimaEarth/run_cloudy_slabplanet.jl > experiments/ClimaEarth/run_cloudy_slabplanet_short.jl + # - "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_cloudy_slabplanet_short.jl" + # artifact_paths: "cloudy_slabplanet/cloudy_slabplanet/clima_atmos/*" + + # agents: + # slurm_mem: 20GB + + # - wait + # - label: ":construction: Hierarchy plots" + # key: "hierarchy_plots" + # command: + # - "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/hierarchy/climate_plots.jl" + # artifact_paths: "paper_figs/*" + # agents: + # slurm_mem: 20GB - group: "GPU integration tests" steps: @@ -447,6 +448,13 @@ steps: agents: slurm_mem: 20GB slurm_gpus: 1 + - label: "2 GPU Slabplanet: albedo from function" + key: "gpu_2_slabplanet_albedo_function" + command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/slabplanet_albedo_function.yml 
--job_id gpu_2_slabplanet_albedo_function" + artifact_paths: "experiments/ClimaEarth/output/slabplanet/gpu_2_slabplanet_albedo_function_artifacts/*" + agents: + slurm_mem: 20GB + slurm_gpus: 2 - label: "GPU Slabplanet: albedo from static map" key: "gpu_slabplanet_albedo_static_map" @@ -457,26 +465,29 @@ steps: agents: slurm_mem: 20GB slurm_gpus: 1 - - - label: "GPU Slabplanet: albedo from temporal map" - key: "gpu_slabplanet_albedo_temporal_map" - command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/slabplanet_albedo_temporal_map.yml --job_id gpu_slabplanet_albedo_temporal_map" - artifact_paths: "experiments/ClimaEarth/output/slabplanet/gpu_slabplanet_albedo_temporal_map_artifacts/*" - env: - CLIMACOMMS_DEVICE: "CUDA" - agents: - slurm_mem: 20GB - slurm_gpus: 1 - - - label: "GPU Slabplanet: extra atmos diagnostics" - key: "gpu_slabplanet_atmos_diags" - command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/slabplanet_atmos_diags.yml --job_id gpu_slabplanet_atmos_diags" - artifact_paths: "experiments/ClimaEarth/output/slabplanet/gpu_slabplanet_atmos_diags_artifacts/*" - env: - CLIMACOMMS_DEVICE: "CUDA" - agents: - slurm_mem: 20GB - slurm_gpus: 1 + - label: "2 GPU Slabplanet: albedo from static map" + key: "gpu_2_slabplanet_albedo_static_map" + command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/slabplanet_albedo_static_map.yml --job_id gpu_2_slabplanet_albedo_static_map" + artifact_paths: "experiments/ClimaEarth/output/slabplanet/gpu_2_slabplanet_albedo_static_map_artifacts/*" + agents: + slurm_mem: 20GB + slurm_gpus: 2 + + # - label: "GPU Slabplanet: albedo from temporal map" + # key: "gpu_slabplanet_albedo_temporal_map" + # command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file 
$CONFIG_PATH/slabplanet_albedo_temporal_map.yml --job_id gpu_slabplanet_albedo_temporal_map" + # artifact_paths: "experiments/ClimaEarth/output/slabplanet/gpu_slabplanet_albedo_temporal_map_artifacts/*" + # agents: + # slurm_mem: 20GB + # slurm_gpus: 1 + + # - label: "GPU Slabplanet: extra atmos diagnostics" + # key: "gpu_slabplanet_atmos_diags" + # command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/slabplanet_atmos_diags.yml --job_id gpu_slabplanet_atmos_diags" + # artifact_paths: "experiments/ClimaEarth/output/slabplanet/gpu_slabplanet_atmos_diags_artifacts/*" + # agents: + # slurm_mem: 20GB + # slurm_gpus: 1 # GPU RUNS: AMIP - label: "GPU AMIP test: albedo from function" @@ -488,16 +499,21 @@ steps: agents: slurm_mem: 20GB slurm_gpus: 1 - - - label: "GPU AMIP target: topography and diagnostic EDMF" - key: "gpu_amip_target_topo_diagedmf_shortrun" - command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/gpu_amip_target_topo_diagedmf_shortrun.yml --job_id gpu_amip_target_topo_diagedmf_shortrun" - artifact_paths: "experiments/ClimaEarth/output/amip/gpu_amip_target_topo_diagedmf_shortrun_artifacts/*" - env: - CLIMACOMMS_DEVICE: "CUDA" + - label: "2 GPU AMIP test: albedo from function" + key: "gpu_2_amip_albedo_function" + command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/gpu_amip_albedo_function.yml --job_id gpu_2_amip_albedo_function" + artifact_paths: "experiments/ClimaEarth/output/amip/gpu_2_amip_albedo_function_artifacts/*" agents: slurm_mem: 20GB - slurm_gpus: 1 + slurm_gpus: 2 + + # - label: "GPU AMIP target: topography and diagnostic EDMF" + # key: "gpu_amip_target_topo_diagedmf_shortrun" + # command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file 
$CONFIG_PATH/gpu_amip_target_topo_diagedmf_shortrun.yml --job_id gpu_amip_target_topo_diagedmf_shortrun" + # artifact_paths: "experiments/ClimaEarth/output/amip/gpu_amip_target_topo_diagedmf_shortrun_artifacts/*" + # agents: + # slurm_mem: 20GB + # slurm_gpus: 1 - label: "GPU AMIP: albedo from static map" key: "gpu_amip_albedo_static_map" @@ -508,31 +524,36 @@ steps: agents: slurm_mem: 20GB slurm_gpus: 1 - - - label: "GPU AMIP: albedo from temporal map" - key: "gpu_amip_albedo_temporal_map" - command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/gpu_amip_albedo_temporal_map.yml --job_id gpu_amip_albedo_temporal_map" - artifact_paths: "experiments/ClimaEarth/output/amip/gpu_amip_albedo_temporal_map_artifacts/*" - env: - CLIMACOMMS_DEVICE: "CUDA" - agents: - slurm_mem: 20GB - slurm_gpus: 1 - - - group: "Bash scripts" - steps: - - label: "Submit and Monitor sbatch Job on Caltech HPC" - # check that (1) the script can be succesfully submitted, (2) it runs successfully - command: "test/mpi_tests/test_sbatch_script.sh" - agents: - slurm_ntasks: 1 - soft_fail: true - - - wait - - # plot job performance history - - label: ":chart_with_downwards_trend: build history" - command: - - build_history staging # name of branch to plot - artifact_paths: - - "build_history.html" + - label: "2 GPU AMIP: albedo from static map" + key: "gpu_2_amip_albedo_static_map" + command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/gpu_amip_albedo_static_map.yml --job_id gpu_2_amip_albedo_static_map" + artifact_paths: "experiments/ClimaEarth/output/amip/gpu_2_amip_albedo_static_map_artifacts/*" + agents: + slurm_mem: 20GB + slurm_gpus: 2 + + # - label: "GPU AMIP: albedo from temporal map" + # key: "gpu_amip_albedo_temporal_map" + # command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file 
$CONFIG_PATH/gpu_amip_albedo_temporal_map.yml --job_id gpu_amip_albedo_temporal_map" + # artifact_paths: "experiments/ClimaEarth/output/amip/gpu_amip_albedo_temporal_map_artifacts/*" + # agents: + # slurm_mem: 20GB + # slurm_gpus: 1 + + # - group: "Bash scripts" + # steps: + # - label: "Submit and Monitor sbatch Job on Caltech HPC" + # # check that (1) the script can be succesfully submitted, (2) it runs successfully + # command: "test/mpi_tests/test_sbatch_script.sh" + # agents: + # slurm_ntasks: 1 + # soft_fail: true + + # - wait + + # # plot job performance history + # - label: ":chart_with_downwards_trend: build history" + # command: + # - build_history staging # name of branch to plot + # artifact_paths: + # - "build_history.html" From ac2f82cd710e2a94af7820189d9e536aca0c1261 Mon Sep 17 00:00:00 2001 From: Julia Sloan Date: Tue, 18 Jun 2024 15:37:29 -0700 Subject: [PATCH 02/16] add 2 GPU runs to benchmark pipeline --- .buildkite/benchmarks/pipeline.yml | 142 ++++++++++++++++------------- 1 file changed, 77 insertions(+), 65 deletions(-) diff --git a/.buildkite/benchmarks/pipeline.yml b/.buildkite/benchmarks/pipeline.yml index f6fddc80d..176d48c3a 100644 --- a/.buildkite/benchmarks/pipeline.yml +++ b/.buildkite/benchmarks/pipeline.yml @@ -41,46 +41,43 @@ steps: - wait - - group: "CPU benchmarks" - steps: - - label: "CPU ClimaAtmos without diagnostic EDMF" - key: "climaatmos" - command: "srun julia --color=yes --project=test/ test/component_model_tests/climaatmos_standalone/atmos_driver.jl --config_file $BENCHMARK_CONFIG_PATH/climaatmos.yml --job_id climaatmos" - artifact_paths: "experiments/ClimaEarth/output/climaatmos/climaatmos_artifacts/*" - env: - CLIMACOMMS_CONTEXT: "MPI" - BUILD_HISTORY_HANDLE: "" - CLIMACOMMS_DEVICE: "CPU" - agents: - slurm_ntasks_per_node: 64 - slurm_nodes: 1 - slurm_mem_per_cpu: 4GB + # - group: "CPU benchmarks" + # steps: + # - label: "CPU ClimaAtmos without diagnostic EDMF" + # key: "climaatmos" + # command: "srun julia --color=yes 
--project=test/ test/component_model_tests/climaatmos_standalone/atmos_driver.jl --config_file $BENCHMARK_CONFIG_PATH/climaatmos.yml --job_id climaatmos" + # artifact_paths: "experiments/ClimaEarth/output/climaatmos/climaatmos_artifacts/*" + # env: + # BUILD_HISTORY_HANDLE: "" + # CLIMACOMMS_DEVICE: "CPU" + # agents: + # slurm_ntasks_per_node: 64 + # slurm_nodes: 1 + # slurm_mem_per_cpu: 4GB - - label: "CPU ClimaAtmos with diagnostic EDMF" - key: "climaatmos_diagedmf" - command: "srun julia --color=yes --project=test/ test/component_model_tests/climaatmos_standalone/atmos_driver.jl --config_file $BENCHMARK_CONFIG_PATH/climaatmos_diagedmf.yml --job_id climaatmos_diagedmf" - artifact_paths: "experiments/ClimaEarth/output/climaatmos/climaatmos_diagedmf_artifacts/*" - env: - CLIMACOMMS_CONTEXT: "MPI" - BUILD_HISTORY_HANDLE: "" - CLIMACOMMS_DEVICE: "CPU" - agents: - slurm_ntasks_per_node: 64 - slurm_nodes: 1 - slurm_mem_per_cpu: 4GB + # - label: "CPU ClimaAtmos with diagnostic EDMF" + # key: "climaatmos_diagedmf" + # command: "srun julia --color=yes --project=test/ test/component_model_tests/climaatmos_standalone/atmos_driver.jl --config_file $BENCHMARK_CONFIG_PATH/climaatmos_diagedmf.yml --job_id climaatmos_diagedmf" + # artifact_paths: "experiments/ClimaEarth/output/climaatmos/climaatmos_diagedmf_artifacts/*" + # env: + # BUILD_HISTORY_HANDLE: "" + # CLIMACOMMS_DEVICE: "CPU" + # agents: + # slurm_ntasks_per_node: 64 + # slurm_nodes: 1 + # slurm_mem_per_cpu: 4GB - - label: "CPU AMIP with diagnostic EDMF" - key: "amip_diagedmf" - command: "srun julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $BENCHMARK_CONFIG_PATH/amip_diagedmf.yml --job_id amip_diagedmf" - artifact_paths: "experiments/ClimaEarth/output/amip/amip_diagedmf_artifacts/*" - env: - CLIMACOMMS_CONTEXT: "MPI" - BUILD_HISTORY_HANDLE: "" - CLIMACOMMS_DEVICE: "CPU" - agents: - slurm_ntasks_per_node: 64 - slurm_nodes: 1 - slurm_mem_per_cpu: 4GB + # - label: "CPU 
AMIP with diagnostic EDMF" + # key: "amip_diagedmf" + # command: "srun julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $BENCHMARK_CONFIG_PATH/amip_diagedmf.yml --job_id amip_diagedmf" + # artifact_paths: "experiments/ClimaEarth/output/amip/amip_diagedmf_artifacts/*" + # env: + # BUILD_HISTORY_HANDLE: "" + # CLIMACOMMS_DEVICE: "CPU" + # agents: + # slurm_ntasks_per_node: 64 + # slurm_nodes: 1 + # slurm_mem_per_cpu: 4GB - group: "GPU benchmarks" steps: @@ -96,20 +93,26 @@ steps: slurm_cpus_per_task: 4 slurm_ntasks: 4 slurm_mem: 16GB - - - label: "GPU ClimaAtmos with diagnostic EDMF" - key: "gpu_climaatmos_diagedmf" - command: "srun julia --threads=3 --color=yes --project=test/ test/component_model_tests/climaatmos_standalone/atmos_driver.jl --config_file $BENCHMARK_CONFIG_PATH/climaatmos_diagedmf.yml --job_id gpu_climaatmos_diagedmf" - artifact_paths: "experiments/ClimaEarth/output/climaatmos/gpu_climaatmos_diagedmf_artifacts/*" - env: - CLIMACOMMS_CONTEXT: "MPI" - CLIMACOMMS_DEVICE: "CUDA" + - label: "2 GPU ClimaAtmos without diagnostic EDMF" + key: "gpu_2_climaatmos" + command: "srun julia --threads=3 --color=yes --project=test/ test/component_model_tests/climaatmos_standalone/atmos_driver.jl --config_file $BENCHMARK_CONFIG_PATH/climaatmos.yml --job_id gpu_2_climaatmos" + artifact_paths: "experiments/ClimaEarth/output/climaatmos/gpu_2_climaatmos_artifacts/*" agents: slurm_gpus_per_task: 1 slurm_cpus_per_task: 4 - slurm_ntasks: 4 + slurm_ntasks: 2 slurm_mem: 16GB + # - label: "GPU ClimaAtmos with diagnostic EDMF" + # key: "gpu_climaatmos_diagedmf" + # command: "srun julia --threads=3 --color=yes --project=test/ test/component_model_tests/climaatmos_standalone/atmos_driver.jl --config_file $BENCHMARK_CONFIG_PATH/climaatmos_diagedmf.yml --job_id gpu_climaatmos_diagedmf" + # artifact_paths: "experiments/ClimaEarth/output/climaatmos/gpu_climaatmos_diagedmf_artifacts/*" + # agents: + # slurm_gpus_per_task: 1 + # 
slurm_cpus_per_task: 4 + # slurm_ntasks: 4 + # slurm_mem: 16GB + - label: "GPU AMIP with diagnostic EDMF" key: "gpu_amip_diagedmf" command: "srun julia --threads=3 --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $BENCHMARK_CONFIG_PATH/amip_diagedmf.yml --job_id gpu_amip_diagedmf" @@ -122,23 +125,32 @@ steps: slurm_cpus_per_task: 4 slurm_ntasks: 4 slurm_mem: 16GB + - label: "2 GPU AMIP with diagnostic EDMF" + key: "gpu_2_amip_diagedmf" + command: "srun julia --threads=3 --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $BENCHMARK_CONFIG_PATH/amip_diagedmf.yml --job_id gpu_2_amip_diagedmf" + artifact_paths: "experiments/ClimaEarth/output/amip/gpu_2_amip_diagedmf_artifacts/*" + agents: + slurm_gpus_per_task: 1 + slurm_cpus_per_task: 4 + slurm_ntasks: 2 + slurm_mem: 16GB - - group: "Generate output table" - steps: - - label: "Compare AMIP/Atmos-only with diagnostic EDMF" - key: "compare_amip_climaatmos_amip_diagedmf" - command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/user_io/benchmarks.jl --cpu_job_id_coupled amip_diagedmf --cpu_job_id_atmos_diagedmf climaatmos_diagedmf --cpu_job_id_atmos climaatmos --build_id $BUILDKITE_BUILD_NUMBER" - artifact_paths: "experiments/ClimaEarth/output/compare_amip_climaatmos_amip_diagedmf/*" - depends_on: - - "climaatmos" - - "climaatmos_diagedmf" - - "amip_diagedmf" - - "gpu_climaatmos" - - "gpu_climaatmos_diagedmf" - - "gpu_amip_diagedmf" + # - group: "Generate output table" + # steps: + # - label: "Compare AMIP/Atmos-only with diagnostic EDMF" + # key: "compare_amip_climaatmos_amip_diagedmf" + # command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/user_io/benchmarks.jl --cpu_job_id_coupled amip_diagedmf --cpu_job_id_atmos_diagedmf climaatmos_diagedmf --cpu_job_id_atmos climaatmos --build_id $BUILDKITE_BUILD_NUMBER" + # artifact_paths: 
"experiments/ClimaEarth/output/compare_amip_climaatmos_amip_diagedmf/*" + # depends_on: + # - "climaatmos" + # - "climaatmos_diagedmf" + # - "amip_diagedmf" + # - "gpu_climaatmos" + # - "gpu_climaatmos_diagedmf" + # - "gpu_amip_diagedmf" - - label: ":envelope: Slack report: CPU/GPU AMIP/Atmos-only table" - depends_on: - - "compare_amip_climaatmos_amip_diagedmf" - command: - - slack-upload -c "#coupler-report" -f experiments/ClimaEarth/output/compare_amip_climaatmos_amip_diagedmf/table.txt -m txt -n compare_amip_climaatmos_amip_diagedmf_table -x "Coupler CPU/GPU Comparison Table" + # - label: ":envelope: Slack report: CPU/GPU AMIP/Atmos-only table" + # depends_on: + # - "compare_amip_climaatmos_amip_diagedmf" + # command: + # - slack-upload -c "#coupler-report" -f experiments/ClimaEarth/output/compare_amip_climaatmos_amip_diagedmf/table.txt -m txt -n compare_amip_climaatmos_amip_diagedmf_table -x "Coupler CPU/GPU Comparison Table" From 99a3fb6df1431b7eeb002f239aeeca8061d753e7 Mon Sep 17 00:00:00 2001 From: Julia Sloan Date: Tue, 18 Jun 2024 16:14:44 -0700 Subject: [PATCH 03/16] run benchmark run on new-central --- .buildkite/pipeline.yml | 173 ++++++++++++++++++++++------------------ 1 file changed, 96 insertions(+), 77 deletions(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index a48d6aca2..54cb79092 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -15,6 +15,7 @@ env: CONFIG_PATH: "config/ci_configs" PERF_CONFIG_PATH: "config/perf_configs" + BENCHMARK_CONFIG_PATH: "config/benchmark_configs" timeout_in_minutes: 240 @@ -438,99 +439,117 @@ steps: - group: "GPU integration tests" steps: - # GPU RUNS: slabplanet - - label: "GPU Slabplanet: albedo from function" - key: "gpu_slabplanet_albedo_function" - command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/slabplanet_albedo_function.yml --job_id gpu_slabplanet_albedo_function" - artifact_paths: 
"experiments/ClimaEarth/output/slabplanet/gpu_slabplanet_albedo_function_artifacts/*" - env: - CLIMACOMMS_DEVICE: "CUDA" + - label: "GPU AMIP with diagnostic EDMF" + key: "gpu_amip_diagedmf" + command: "srun julia --threads=3 --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $BENCHMARK_CONFIG_PATH/amip_diagedmf.yml --job_id gpu_amip_diagedmf" + artifact_paths: "experiments/ClimaEarth/output/amip/gpu_amip_diagedmf_artifacts/*" agents: - slurm_mem: 20GB - slurm_gpus: 1 - - label: "2 GPU Slabplanet: albedo from function" - key: "gpu_2_slabplanet_albedo_function" - command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/slabplanet_albedo_function.yml --job_id gpu_2_slabplanet_albedo_function" - artifact_paths: "experiments/ClimaEarth/output/slabplanet/gpu_2_slabplanet_albedo_function_artifacts/*" - agents: - slurm_mem: 20GB - slurm_gpus: 2 - - - label: "GPU Slabplanet: albedo from static map" - key: "gpu_slabplanet_albedo_static_map" - command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/slabplanet_albedo_static_map.yml --job_id gpu_slabplanet_albedo_static_map" - artifact_paths: "experiments/ClimaEarth/output/slabplanet/gpu_slabplanet_albedo_static_map_artifacts/*" - env: - CLIMACOMMS_DEVICE: "CUDA" + slurm_gpus_per_task: 1 + slurm_cpus_per_task: 4 + slurm_ntasks: 4 + slurm_mem: 16GB + - label: "2 GPU AMIP with diagnostic EDMF" + key: "gpu_2_amip_diagedmf" + command: "srun julia --threads=3 --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $BENCHMARK_CONFIG_PATH/amip_diagedmf.yml --job_id gpu_2_amip_diagedmf" + artifact_paths: "experiments/ClimaEarth/output/amip/gpu_2_amip_diagedmf_artifacts/*" agents: - slurm_mem: 20GB - slurm_gpus: 1 - - label: "2 GPU Slabplanet: albedo from static map" - key: "gpu_2_slabplanet_albedo_static_map" - command: 
"julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/slabplanet_albedo_static_map.yml --job_id gpu_2_slabplanet_albedo_static_map" - artifact_paths: "experiments/ClimaEarth/output/slabplanet/gpu_2_slabplanet_albedo_static_map_artifacts/*" + slurm_gpus_per_task: 1 + slurm_cpus_per_task: 4 + slurm_ntasks: 2 + slurm_mem: 16GB + - label: "2 GPU AMIP with diagnostic EDMF" + key: "gpu_2_ngpus_amip_diagedmf" + command: "srun julia --threads=3 --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $BENCHMARK_CONFIG_PATH/amip_diagedmf.yml --job_id gpu_2_ngpus_amip_diagedmf" + artifact_paths: "experiments/ClimaEarth/output/amip/gpu_2_ngpus_amip_diagedmf_artifacts/*" agents: slurm_mem: 20GB slurm_gpus: 2 - # - label: "GPU Slabplanet: albedo from temporal map" - # key: "gpu_slabplanet_albedo_temporal_map" - # command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/slabplanet_albedo_temporal_map.yml --job_id gpu_slabplanet_albedo_temporal_map" - # artifact_paths: "experiments/ClimaEarth/output/slabplanet/gpu_slabplanet_albedo_temporal_map_artifacts/*" + # # GPU RUNS: slabplanet + # - label: "GPU Slabplanet: albedo from function" + # key: "gpu_slabplanet_albedo_function" + # command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/slabplanet_albedo_function.yml --job_id gpu_slabplanet_albedo_function" + # artifact_paths: "experiments/ClimaEarth/output/slabplanet/gpu_slabplanet_albedo_function_artifacts/*" # agents: # slurm_mem: 20GB # slurm_gpus: 1 + # - label: "2 GPU Slabplanet: albedo from function" + # key: "gpu_2_slabplanet_albedo_function" + # command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/slabplanet_albedo_function.yml --job_id gpu_2_slabplanet_albedo_function" + # 
artifact_paths: "experiments/ClimaEarth/output/slabplanet/gpu_2_slabplanet_albedo_function_artifacts/*" + # agents: + # slurm_mem: 20GB + # slurm_gpus: 2 - # - label: "GPU Slabplanet: extra atmos diagnostics" - # key: "gpu_slabplanet_atmos_diags" - # command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/slabplanet_atmos_diags.yml --job_id gpu_slabplanet_atmos_diags" - # artifact_paths: "experiments/ClimaEarth/output/slabplanet/gpu_slabplanet_atmos_diags_artifacts/*" + # - label: "GPU Slabplanet: albedo from static map" + # key: "gpu_slabplanet_albedo_static_map" + # command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/slabplanet_albedo_static_map.yml --job_id gpu_slabplanet_albedo_static_map" + # artifact_paths: "experiments/ClimaEarth/output/slabplanet/gpu_slabplanet_albedo_static_map_artifacts/*" # agents: # slurm_mem: 20GB # slurm_gpus: 1 - - # GPU RUNS: AMIP - - label: "GPU AMIP test: albedo from function" - key: "gpu_amip_albedo_function" - command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/gpu_amip_albedo_function.yml --job_id gpu_amip_albedo_function" - artifact_paths: "experiments/ClimaEarth/output/amip/gpu_amip_albedo_function_artifacts/*" - env: - CLIMACOMMS_DEVICE: "CUDA" - agents: - slurm_mem: 20GB - slurm_gpus: 1 - - label: "2 GPU AMIP test: albedo from function" - key: "gpu_2_amip_albedo_function" - command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/gpu_amip_albedo_function.yml --job_id gpu_2_amip_albedo_function" - artifact_paths: "experiments/ClimaEarth/output/amip/gpu_2_amip_albedo_function_artifacts/*" - agents: - slurm_mem: 20GB - slurm_gpus: 2 - - # - label: "GPU AMIP target: topography and diagnostic EDMF" - # key: "gpu_amip_target_topo_diagedmf_shortrun" - # 
command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/gpu_amip_target_topo_diagedmf_shortrun.yml --job_id gpu_amip_target_topo_diagedmf_shortrun" - # artifact_paths: "experiments/ClimaEarth/output/amip/gpu_amip_target_topo_diagedmf_shortrun_artifacts/*" + # - label: "2 GPU Slabplanet: albedo from static map" + # key: "gpu_2_slabplanet_albedo_static_map" + # command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/slabplanet_albedo_static_map.yml --job_id gpu_2_slabplanet_albedo_static_map" + # artifact_paths: "experiments/ClimaEarth/output/slabplanet/gpu_2_slabplanet_albedo_static_map_artifacts/*" + # agents: + # slurm_mem: 20GB + # slurm_gpus: 2 + + # # - label: "GPU Slabplanet: albedo from temporal map" + # # key: "gpu_slabplanet_albedo_temporal_map" + # # command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/slabplanet_albedo_temporal_map.yml --job_id gpu_slabplanet_albedo_temporal_map" + # # artifact_paths: "experiments/ClimaEarth/output/slabplanet/gpu_slabplanet_albedo_temporal_map_artifacts/*" + # # agents: + # # slurm_mem: 20GB + # # slurm_gpus: 1 + + # # - label: "GPU Slabplanet: extra atmos diagnostics" + # # key: "gpu_slabplanet_atmos_diags" + # # command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/slabplanet_atmos_diags.yml --job_id gpu_slabplanet_atmos_diags" + # # artifact_paths: "experiments/ClimaEarth/output/slabplanet/gpu_slabplanet_atmos_diags_artifacts/*" + # # agents: + # # slurm_mem: 20GB + # # slurm_gpus: 1 + + # # GPU RUNS: AMIP + # - label: "GPU AMIP test: albedo from function" + # key: "gpu_amip_albedo_function" + # command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/gpu_amip_albedo_function.yml 
--job_id gpu_amip_albedo_function" + # artifact_paths: "experiments/ClimaEarth/output/amip/gpu_amip_albedo_function_artifacts/*" # agents: # slurm_mem: 20GB # slurm_gpus: 1 - - - label: "GPU AMIP: albedo from static map" - key: "gpu_amip_albedo_static_map" - command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/gpu_amip_albedo_static_map.yml --job_id gpu_amip_albedo_static_map" - artifact_paths: "experiments/ClimaEarth/output/amip/gpu_amip_albedo_static_map_artifacts/*" - env: - CLIMACOMMS_DEVICE: "CUDA" - agents: - slurm_mem: 20GB - slurm_gpus: 1 - - label: "2 GPU AMIP: albedo from static map" - key: "gpu_2_amip_albedo_static_map" - command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/gpu_amip_albedo_static_map.yml --job_id gpu_2_amip_albedo_static_map" - artifact_paths: "experiments/ClimaEarth/output/amip/gpu_2_amip_albedo_static_map_artifacts/*" - agents: - slurm_mem: 20GB - slurm_gpus: 2 + # - label: "2 GPU AMIP test: albedo from function" + # key: "gpu_2_amip_albedo_function" + # command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/gpu_amip_albedo_function.yml --job_id gpu_2_amip_albedo_function" + # artifact_paths: "experiments/ClimaEarth/output/amip/gpu_2_amip_albedo_function_artifacts/*" + # agents: + # slurm_mem: 20GB + # slurm_gpus: 2 + + # # - label: "GPU AMIP target: topography and diagnostic EDMF" + # # key: "gpu_amip_target_topo_diagedmf_shortrun" + # # command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/gpu_amip_target_topo_diagedmf_shortrun.yml --job_id gpu_amip_target_topo_diagedmf_shortrun" + # # artifact_paths: "experiments/ClimaEarth/output/amip/gpu_amip_target_topo_diagedmf_shortrun_artifacts/*" + # # agents: + # # slurm_mem: 20GB + # # slurm_gpus: 1 + + # - label: 
"GPU AMIP: albedo from static map" + # key: "gpu_amip_albedo_static_map" + # command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/gpu_amip_albedo_static_map.yml --job_id gpu_amip_albedo_static_map" + # artifact_paths: "experiments/ClimaEarth/output/amip/gpu_amip_albedo_static_map_artifacts/*" + # agents: + # slurm_mem: 20GB + # slurm_gpus: 1 + # - label: "2 GPU AMIP: albedo from static map" + # key: "gpu_2_amip_albedo_static_map" + # command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/gpu_amip_albedo_static_map.yml --job_id gpu_2_amip_albedo_static_map" + # artifact_paths: "experiments/ClimaEarth/output/amip/gpu_2_amip_albedo_static_map_artifacts/*" + # agents: + # slurm_mem: 20GB + # slurm_gpus: 2 # - label: "GPU AMIP: albedo from temporal map" # key: "gpu_amip_albedo_temporal_map" From d74934b67c8b251be2ee3ce5c82501d273ad81a0 Mon Sep 17 00:00:00 2001 From: Julia Sloan Date: Tue, 18 Jun 2024 17:11:19 -0700 Subject: [PATCH 04/16] actually run on 2 gpus --- .buildkite/pipeline.yml | 131 ++++++++++++++++++++-------------------- 1 file changed, 66 insertions(+), 65 deletions(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 54cb79092..41ba21f1d 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -457,44 +457,41 @@ steps: slurm_cpus_per_task: 4 slurm_ntasks: 2 slurm_mem: 16GB - - label: "2 GPU AMIP with diagnostic EDMF" - key: "gpu_2_ngpus_amip_diagedmf" - command: "srun julia --threads=3 --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $BENCHMARK_CONFIG_PATH/amip_diagedmf.yml --job_id gpu_2_ngpus_amip_diagedmf" - artifact_paths: "experiments/ClimaEarth/output/amip/gpu_2_ngpus_amip_diagedmf_artifacts/*" + + # GPU RUNS: slabplanet + - label: "GPU Slabplanet: albedo from function" + key: "gpu_slabplanet_albedo_function" + command: "julia 
--color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/slabplanet_albedo_function.yml --job_id gpu_slabplanet_albedo_function" + artifact_paths: "experiments/ClimaEarth/output/slabplanet/gpu_slabplanet_albedo_function_artifacts/*" agents: slurm_mem: 20GB - slurm_gpus: 2 - - # # GPU RUNS: slabplanet - # - label: "GPU Slabplanet: albedo from function" - # key: "gpu_slabplanet_albedo_function" - # command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/slabplanet_albedo_function.yml --job_id gpu_slabplanet_albedo_function" - # artifact_paths: "experiments/ClimaEarth/output/slabplanet/gpu_slabplanet_albedo_function_artifacts/*" - # agents: - # slurm_mem: 20GB - # slurm_gpus: 1 - # - label: "2 GPU Slabplanet: albedo from function" - # key: "gpu_2_slabplanet_albedo_function" - # command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/slabplanet_albedo_function.yml --job_id gpu_2_slabplanet_albedo_function" - # artifact_paths: "experiments/ClimaEarth/output/slabplanet/gpu_2_slabplanet_albedo_function_artifacts/*" - # agents: - # slurm_mem: 20GB - # slurm_gpus: 2 - - # - label: "GPU Slabplanet: albedo from static map" - # key: "gpu_slabplanet_albedo_static_map" - # command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/slabplanet_albedo_static_map.yml --job_id gpu_slabplanet_albedo_static_map" - # artifact_paths: "experiments/ClimaEarth/output/slabplanet/gpu_slabplanet_albedo_static_map_artifacts/*" - # agents: - # slurm_mem: 20GB - # slurm_gpus: 1 - # - label: "2 GPU Slabplanet: albedo from static map" - # key: "gpu_2_slabplanet_albedo_static_map" - # command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/slabplanet_albedo_static_map.yml --job_id 
gpu_2_slabplanet_albedo_static_map" - # artifact_paths: "experiments/ClimaEarth/output/slabplanet/gpu_2_slabplanet_albedo_static_map_artifacts/*" - # agents: - # slurm_mem: 20GB - # slurm_gpus: 2 + slurm_gpus: 1 + - label: "2 GPU Slabplanet: albedo from function" + key: "gpu_2_slabplanet_albedo_function" + command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/slabplanet_albedo_function.yml --job_id gpu_2_slabplanet_albedo_function" + artifact_paths: "experiments/ClimaEarth/output/slabplanet/gpu_2_slabplanet_albedo_function_artifacts/*" + agents: + slurm_gpus_per_task: 1 + slurm_cpus_per_task: 4 + slurm_ntasks: 2 + slurm_mem: 16GB + + - label: "GPU Slabplanet: albedo from static map" + key: "gpu_slabplanet_albedo_static_map" + command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/slabplanet_albedo_static_map.yml --job_id gpu_slabplanet_albedo_static_map" + artifact_paths: "experiments/ClimaEarth/output/slabplanet/gpu_slabplanet_albedo_static_map_artifacts/*" + agents: + slurm_mem: 20GB + slurm_gpus: 1 + - label: "2 GPU Slabplanet: albedo from static map" + key: "gpu_2_slabplanet_albedo_static_map" + command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/slabplanet_albedo_static_map.yml --job_id gpu_2_slabplanet_albedo_static_map" + artifact_paths: "experiments/ClimaEarth/output/slabplanet/gpu_2_slabplanet_albedo_static_map_artifacts/*" + agents: + slurm_gpus_per_task: 1 + slurm_cpus_per_task: 4 + slurm_ntasks: 2 + slurm_mem: 16GB # # - label: "GPU Slabplanet: albedo from temporal map" # # key: "gpu_slabplanet_albedo_temporal_map" @@ -512,21 +509,23 @@ steps: # # slurm_mem: 20GB # # slurm_gpus: 1 - # # GPU RUNS: AMIP - # - label: "GPU AMIP test: albedo from function" - # key: "gpu_amip_albedo_function" - # command: "julia --color=yes 
--project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/gpu_amip_albedo_function.yml --job_id gpu_amip_albedo_function" - # artifact_paths: "experiments/ClimaEarth/output/amip/gpu_amip_albedo_function_artifacts/*" - # agents: - # slurm_mem: 20GB - # slurm_gpus: 1 - # - label: "2 GPU AMIP test: albedo from function" - # key: "gpu_2_amip_albedo_function" - # command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/gpu_amip_albedo_function.yml --job_id gpu_2_amip_albedo_function" - # artifact_paths: "experiments/ClimaEarth/output/amip/gpu_2_amip_albedo_function_artifacts/*" - # agents: - # slurm_mem: 20GB - # slurm_gpus: 2 + # GPU RUNS: AMIP + - label: "GPU AMIP test: albedo from function" + key: "gpu_amip_albedo_function" + command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/gpu_amip_albedo_function.yml --job_id gpu_amip_albedo_function" + artifact_paths: "experiments/ClimaEarth/output/amip/gpu_amip_albedo_function_artifacts/*" + agents: + slurm_mem: 20GB + slurm_gpus: 1 + - label: "2 GPU AMIP test: albedo from function" + key: "gpu_2_amip_albedo_function" + command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/gpu_amip_albedo_function.yml --job_id gpu_2_amip_albedo_function" + artifact_paths: "experiments/ClimaEarth/output/amip/gpu_2_amip_albedo_function_artifacts/*" + agents: + slurm_gpus_per_task: 1 + slurm_cpus_per_task: 4 + slurm_ntasks: 2 + slurm_mem: 16GB # # - label: "GPU AMIP target: topography and diagnostic EDMF" # # key: "gpu_amip_target_topo_diagedmf_shortrun" @@ -536,20 +535,22 @@ steps: # # slurm_mem: 20GB # # slurm_gpus: 1 - # - label: "GPU AMIP: albedo from static map" - # key: "gpu_amip_albedo_static_map" - # command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl 
--config_file $CONFIG_PATH/gpu_amip_albedo_static_map.yml --job_id gpu_amip_albedo_static_map" - # artifact_paths: "experiments/ClimaEarth/output/amip/gpu_amip_albedo_static_map_artifacts/*" - # agents: - # slurm_mem: 20GB - # slurm_gpus: 1 - # - label: "2 GPU AMIP: albedo from static map" - # key: "gpu_2_amip_albedo_static_map" - # command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/gpu_amip_albedo_static_map.yml --job_id gpu_2_amip_albedo_static_map" - # artifact_paths: "experiments/ClimaEarth/output/amip/gpu_2_amip_albedo_static_map_artifacts/*" - # agents: - # slurm_mem: 20GB - # slurm_gpus: 2 + - label: "GPU AMIP: albedo from static map" + key: "gpu_amip_albedo_static_map" + command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/gpu_amip_albedo_static_map.yml --job_id gpu_amip_albedo_static_map" + artifact_paths: "experiments/ClimaEarth/output/amip/gpu_amip_albedo_static_map_artifacts/*" + agents: + slurm_mem: 20GB + slurm_gpus: 1 + - label: "2 GPU AMIP: albedo from static map" + key: "gpu_2_amip_albedo_static_map" + command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/gpu_amip_albedo_static_map.yml --job_id gpu_2_amip_albedo_static_map" + artifact_paths: "experiments/ClimaEarth/output/amip/gpu_2_amip_albedo_static_map_artifacts/*" + agents: + slurm_gpus_per_task: 1 + slurm_cpus_per_task: 4 + slurm_ntasks: 2 + slurm_mem: 16GB # - label: "GPU AMIP: albedo from temporal map" # key: "gpu_amip_albedo_temporal_map" From a9fa01d4b743f8ca7be02915ee3e151d8cd8ba6d Mon Sep 17 00:00:00 2001 From: Julia Sloan Date: Thu, 20 Jun 2024 15:25:40 -0700 Subject: [PATCH 05/16] add prints --- src/Regridder.jl | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/src/Regridder.jl b/src/Regridder.jl index bef60e3b9..d3a8dafc3 100644 --- a/src/Regridder.jl 
+++ b/src/Regridder.jl @@ -513,6 +513,27 @@ function update_surface_fractions!(cs::Interfacer.CoupledSimulation) cs.surface_fractions.ice .= max.(min.(ice_d, FT(1) .- land_s), FT(0)) cs.surface_fractions.ocean .= max.(FT(1) .- (cs.surface_fractions.ice .+ land_s), FT(0)) + if !(minimum(cs.surface_fractions.ice .+ land_s .+ cs.surface_fractions.ocean) ≈ FT(1)) + @show minimum(cs.surface_fractions.ice .+ cs.surface_fractions.land .+ cs.surface_fractions.ocean) + @show maximum(cs.surface_fractions.ice .+ cs.surface_fractions.land .+ cs.surface_fractions.ocean) + + aland = Array(parent(land_s)) + aice = Array(parent(cs.surface_fractions.ice)) + aocean = Array(parent(cs.surface_fractions.ocean)) + for i in eachindex(Array(parent(land_s))) + land_i = aland[i] + ice_i = aice[i] + ocean_i = aocean[i] + + # ice = max(min(ice_i, FT(1) - land_i), FT(0)) + if !(ice_i + land_i + ocean_i ≈ FT(1)) + @show "INVALID SUM AT INDEX $i" + @show ice_i, land_i, ocean_i + println() + end + end + end + @assert minimum(cs.surface_fractions.ice .+ cs.surface_fractions.land .+ cs.surface_fractions.ocean) ≈ FT(1) @assert maximum(cs.surface_fractions.ice .+ cs.surface_fractions.land .+ cs.surface_fractions.ocean) ≈ FT(1) From 61dca1c089bdc90be7ab90e27c953e0c7c7d7810 Mon Sep 17 00:00:00 2001 From: Julia Sloan Date: Thu, 20 Jun 2024 18:02:29 -0700 Subject: [PATCH 06/16] try land from cs --- src/Regridder.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Regridder.jl b/src/Regridder.jl index d3a8dafc3..8aa2f1b16 100644 --- a/src/Regridder.jl +++ b/src/Regridder.jl @@ -517,7 +517,7 @@ function update_surface_fractions!(cs::Interfacer.CoupledSimulation) @show minimum(cs.surface_fractions.ice .+ cs.surface_fractions.land .+ cs.surface_fractions.ocean) @show maximum(cs.surface_fractions.ice .+ cs.surface_fractions.land .+ cs.surface_fractions.ocean) - aland = Array(parent(land_s)) + aland = Array(parent(cs.surface_fractions.land)) aice = Array(parent(cs.surface_fractions.ice)) 
aocean = Array(parent(cs.surface_fractions.ocean)) for i in eachindex(Array(parent(land_s))) From db32e8fb7116cae55218ceea1fdfe5b5ce881b08 Mon Sep 17 00:00:00 2001 From: Julia Sloan Date: Fri, 21 Jun 2024 10:11:04 -0700 Subject: [PATCH 07/16] more printing --- src/Regridder.jl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/Regridder.jl b/src/Regridder.jl index 8aa2f1b16..2ea07c170 100644 --- a/src/Regridder.jl +++ b/src/Regridder.jl @@ -515,12 +515,14 @@ function update_surface_fractions!(cs::Interfacer.CoupledSimulation) if !(minimum(cs.surface_fractions.ice .+ land_s .+ cs.surface_fractions.ocean) ≈ FT(1)) @show minimum(cs.surface_fractions.ice .+ cs.surface_fractions.land .+ cs.surface_fractions.ocean) + @show minimum(parent(cs.surface_fractions.ice) .+ parent(cs.surface_fractions.land) .+ parent(cs.surface_fractions.ocean)) + @show argmin(parent(cs.surface_fractions.ice) .+ parent(cs.surface_fractions.land) .+ parent(cs.surface_fractions.ocean)) @show maximum(cs.surface_fractions.ice .+ cs.surface_fractions.land .+ cs.surface_fractions.ocean) aland = Array(parent(cs.surface_fractions.land)) aice = Array(parent(cs.surface_fractions.ice)) aocean = Array(parent(cs.surface_fractions.ocean)) - for i in eachindex(Array(parent(land_s))) + for i in eachindex(aland) land_i = aland[i] ice_i = aice[i] ocean_i = aocean[i] From 03458a5c4e0ddb9f0f4f0b5edba624ff64ce0a89 Mon Sep 17 00:00:00 2001 From: Julia Sloan Date: Fri, 21 Jun 2024 10:40:18 -0700 Subject: [PATCH 08/16] show extrema --- src/Regridder.jl | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/Regridder.jl b/src/Regridder.jl index 2ea07c170..cf4a45c67 100644 --- a/src/Regridder.jl +++ b/src/Regridder.jl @@ -522,6 +522,14 @@ function update_surface_fractions!(cs::Interfacer.CoupledSimulation) aland = Array(parent(cs.surface_fractions.land)) aice = Array(parent(cs.surface_fractions.ice)) aocean = Array(parent(cs.surface_fractions.ocean)) + + @show extrema(aland) + @show 
extrema(cs.surface_fractions.land) + @show extrema(aice) + @show extrema(cs.surface_fractions.ice) + @show extrema(aocean) + @show extrema(cs.surface_fractions.ocean) + for i in eachindex(aland) land_i = aland[i] ice_i = aice[i] From 0f53b22daf03fbc5d90b63d03dbb8e7b42d36627 Mon Sep 17 00:00:00 2001 From: Julia Sloan Date: Fri, 21 Jun 2024 11:29:10 -0700 Subject: [PATCH 09/16] add temporal map runs --- .buildkite/pipeline.yml | 44 ++++++++++++++++++++++++++++------------- 1 file changed, 30 insertions(+), 14 deletions(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 41ba21f1d..c8002ffed 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -493,13 +493,22 @@ steps: slurm_ntasks: 2 slurm_mem: 16GB - # # - label: "GPU Slabplanet: albedo from temporal map" - # # key: "gpu_slabplanet_albedo_temporal_map" - # # command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/slabplanet_albedo_temporal_map.yml --job_id gpu_slabplanet_albedo_temporal_map" - # # artifact_paths: "experiments/ClimaEarth/output/slabplanet/gpu_slabplanet_albedo_temporal_map_artifacts/*" - # # agents: - # # slurm_mem: 20GB - # # slurm_gpus: 1 + - label: "GPU Slabplanet: albedo from temporal map" + key: "gpu_slabplanet_albedo_temporal_map" + command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/slabplanet_albedo_temporal_map.yml --job_id gpu_slabplanet_albedo_temporal_map" + artifact_paths: "experiments/ClimaEarth/output/slabplanet/gpu_slabplanet_albedo_temporal_map_artifacts/*" + agents: + slurm_mem: 20GB + slurm_gpus: 1 + - label: "2 GPU Slabplanet: albedo from temporal map" + key: "gpu_2_slabplanet_albedo_temporal_map" + command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/slabplanet_albedo_temporal_map.yml --job_id gpu_2_slabplanet_albedo_temporal_map" + 
artifact_paths: "experiments/ClimaEarth/output/slabplanet/gpu_2_slabplanet_albedo_temporal_map_artifacts/*" + agents: + slurm_gpus_per_task: 1 + slurm_cpus_per_task: 4 + slurm_ntasks: 2 + slurm_mem: 16GB # # - label: "GPU Slabplanet: extra atmos diagnostics" # # key: "gpu_slabplanet_atmos_diags" @@ -552,13 +561,20 @@ steps: slurm_ntasks: 2 slurm_mem: 16GB - # - label: "GPU AMIP: albedo from temporal map" - # key: "gpu_amip_albedo_temporal_map" - # command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/gpu_amip_albedo_temporal_map.yml --job_id gpu_amip_albedo_temporal_map" - # artifact_paths: "experiments/ClimaEarth/output/amip/gpu_amip_albedo_temporal_map_artifacts/*" - # agents: - # slurm_mem: 20GB - # slurm_gpus: 1 + - label: "GPU AMIP: albedo from temporal map" + key: "gpu_amip_albedo_temporal_map" + command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/gpu_amip_albedo_temporal_map.yml --job_id gpu_amip_albedo_temporal_map" + artifact_paths: "experiments/ClimaEarth/output/amip/gpu_amip_albedo_temporal_map_artifacts/*" + agents: + slurm_mem: 20GB + slurm_gpus: 1 + - label: "2 GPU AMIP: albedo from temporal map" + key: "gpu_2_amip_albedo_temporal_map" + command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $CONFIG_PATH/gpu_amip_albedo_temporal_map.yml --job_id gpu_2_amip_albedo_temporal_map" + artifact_paths: "experiments/ClimaEarth/output/amip/gpu_2_amip_albedo_temporal_map_artifacts/*" + agents: + slurm_mem: 20GB + slurm_gpus: 1 # - group: "Bash scripts" # steps: From 6aec6fb436ad328ec79e405da8151d60dbfeaa45 Mon Sep 17 00:00:00 2001 From: Julia Sloan Date: Fri, 21 Jun 2024 11:59:03 -0700 Subject: [PATCH 10/16] use h_elem 30 --- config/ci_configs/gpu_amip_albedo_function.yml | 2 +- config/ci_configs/gpu_amip_albedo_static_map.yml | 2 +- 
config/ci_configs/gpu_amip_albedo_temporal_map.yml | 2 +- config/ci_configs/slabplanet_albedo_function.yml | 2 +- config/ci_configs/slabplanet_albedo_static_map.yml | 2 +- config/ci_configs/slabplanet_albedo_temporal_map.yml | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/config/ci_configs/gpu_amip_albedo_function.yml b/config/ci_configs/gpu_amip_albedo_function.yml index 5f0fe3305..2a83b0ad0 100644 --- a/config/ci_configs/gpu_amip_albedo_function.yml +++ b/config/ci_configs/gpu_amip_albedo_function.yml @@ -7,7 +7,7 @@ dt_rad: "1hours" dt_save_to_sol: "1days" dz_bottom: 30 dz_top: 3000 -h_elem: 4 +h_elem: 30 land_albedo_type: "function" mode_name: "amip" moist: "equil" diff --git a/config/ci_configs/gpu_amip_albedo_static_map.yml b/config/ci_configs/gpu_amip_albedo_static_map.yml index f401e573f..b3ee8ea7b 100644 --- a/config/ci_configs/gpu_amip_albedo_static_map.yml +++ b/config/ci_configs/gpu_amip_albedo_static_map.yml @@ -7,7 +7,7 @@ dt_rad: "1hours" dt_save_to_sol: "1days" dz_bottom: 30 dz_top: 3000 -h_elem: 4 +h_elem: 30 land_albedo_type: "map_static" mode_name: "amip" moist: "equil" diff --git a/config/ci_configs/gpu_amip_albedo_temporal_map.yml b/config/ci_configs/gpu_amip_albedo_temporal_map.yml index af5edfeb2..15d3bcdb9 100644 --- a/config/ci_configs/gpu_amip_albedo_temporal_map.yml +++ b/config/ci_configs/gpu_amip_albedo_temporal_map.yml @@ -7,7 +7,7 @@ dt_rad: "1hours" dt_save_to_sol: "1days" dz_bottom: 30 dz_top: 3000 -h_elem: 4 +h_elem: 30 land_albedo_type: "map_temporal" mode_name: "amip" moist: "equil" diff --git a/config/ci_configs/slabplanet_albedo_function.yml b/config/ci_configs/slabplanet_albedo_function.yml index 7b499e100..7eb074cfa 100644 --- a/config/ci_configs/slabplanet_albedo_function.yml +++ b/config/ci_configs/slabplanet_albedo_function.yml @@ -3,7 +3,7 @@ dt: "200secs" dt_cpl: 200 dt_save_to_sol: "3600secs" energy_check: true -h_elem: 4 +h_elem: 30 land_albedo_type: "function" mode_name: "slabplanet" moist: 
"equil" diff --git a/config/ci_configs/slabplanet_albedo_static_map.yml b/config/ci_configs/slabplanet_albedo_static_map.yml index 4a192b93e..8c778a285 100644 --- a/config/ci_configs/slabplanet_albedo_static_map.yml +++ b/config/ci_configs/slabplanet_albedo_static_map.yml @@ -3,7 +3,7 @@ dt: "200secs" dt_cpl: 200 dt_save_to_sol: "3600secs" energy_check: true -h_elem: 4 +h_elem: 30 land_albedo_type: "map_static" mode_name: "slabplanet" moist: "equil" diff --git a/config/ci_configs/slabplanet_albedo_temporal_map.yml b/config/ci_configs/slabplanet_albedo_temporal_map.yml index 96ea2d5c4..f0264be82 100644 --- a/config/ci_configs/slabplanet_albedo_temporal_map.yml +++ b/config/ci_configs/slabplanet_albedo_temporal_map.yml @@ -3,7 +3,7 @@ dt: "200secs" dt_cpl: 200 dt_save_to_sol: "3600secs" energy_check: true -h_elem: 4 +h_elem: 30 land_albedo_type: "map_temporal" mode_name: "slabplanet" moist: "equil" From 923d19d3309acc9d703763db5c48b06ac94f348b Mon Sep 17 00:00:00 2001 From: Julia Sloan Date: Fri, 21 Jun 2024 12:03:39 -0700 Subject: [PATCH 11/16] show min(parent(sum)) --- src/Regridder.jl | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/src/Regridder.jl b/src/Regridder.jl index cf4a45c67..5f27c08a7 100644 --- a/src/Regridder.jl +++ b/src/Regridder.jl @@ -516,20 +516,13 @@ function update_surface_fractions!(cs::Interfacer.CoupledSimulation) if !(minimum(cs.surface_fractions.ice .+ land_s .+ cs.surface_fractions.ocean) ≈ FT(1)) @show minimum(cs.surface_fractions.ice .+ cs.surface_fractions.land .+ cs.surface_fractions.ocean) @show minimum(parent(cs.surface_fractions.ice) .+ parent(cs.surface_fractions.land) .+ parent(cs.surface_fractions.ocean)) - @show argmin(parent(cs.surface_fractions.ice) .+ parent(cs.surface_fractions.land) .+ parent(cs.surface_fractions.ocean)) + @show minimum(parent(cs.surface_fractions.ice .+ cs.surface_fractions.land .+ cs.surface_fractions.ocean)) @show maximum(cs.surface_fractions.ice .+ cs.surface_fractions.land .+ 
cs.surface_fractions.ocean) aland = Array(parent(cs.surface_fractions.land)) aice = Array(parent(cs.surface_fractions.ice)) aocean = Array(parent(cs.surface_fractions.ocean)) - @show extrema(aland) - @show extrema(cs.surface_fractions.land) - @show extrema(aice) - @show extrema(cs.surface_fractions.ice) - @show extrema(aocean) - @show extrema(cs.surface_fractions.ocean) - for i in eachindex(aland) land_i = aland[i] ice_i = aice[i] From f0406e778a35f805089e60fe7f7c627ecf412c7d Mon Sep 17 00:00:00 2001 From: Julia Sloan Date: Fri, 21 Jun 2024 13:06:03 -0700 Subject: [PATCH 12/16] use float32 --- config/ci_configs/gpu_amip_albedo_function.yml | 1 + config/ci_configs/gpu_amip_albedo_static_map.yml | 1 + config/ci_configs/gpu_amip_albedo_temporal_map.yml | 1 + config/ci_configs/slabplanet_albedo_function.yml | 1 + config/ci_configs/slabplanet_albedo_static_map.yml | 1 + config/ci_configs/slabplanet_albedo_temporal_map.yml | 1 + 6 files changed, 6 insertions(+) diff --git a/config/ci_configs/gpu_amip_albedo_function.yml b/config/ci_configs/gpu_amip_albedo_function.yml index 2a83b0ad0..c93664cd0 100644 --- a/config/ci_configs/gpu_amip_albedo_function.yml +++ b/config/ci_configs/gpu_amip_albedo_function.yml @@ -1,3 +1,4 @@ +FLOAT_TYPE: "Float32" alpha_rayleigh_uh: 0 alpha_rayleigh_w: 10 apply_limiter: false diff --git a/config/ci_configs/gpu_amip_albedo_static_map.yml b/config/ci_configs/gpu_amip_albedo_static_map.yml index b3ee8ea7b..3d9a66b46 100644 --- a/config/ci_configs/gpu_amip_albedo_static_map.yml +++ b/config/ci_configs/gpu_amip_albedo_static_map.yml @@ -1,3 +1,4 @@ +FLOAT_TYPE: "Float32" alpha_rayleigh_uh: 0 alpha_rayleigh_w: 10 apply_limiter: false diff --git a/config/ci_configs/gpu_amip_albedo_temporal_map.yml b/config/ci_configs/gpu_amip_albedo_temporal_map.yml index 15d3bcdb9..236ad5d42 100644 --- a/config/ci_configs/gpu_amip_albedo_temporal_map.yml +++ b/config/ci_configs/gpu_amip_albedo_temporal_map.yml @@ -1,3 +1,4 @@ +FLOAT_TYPE: "Float32" 
alpha_rayleigh_uh: 0 alpha_rayleigh_w: 10 apply_limiter: false diff --git a/config/ci_configs/slabplanet_albedo_function.yml b/config/ci_configs/slabplanet_albedo_function.yml index 7eb074cfa..beadfa76c 100644 --- a/config/ci_configs/slabplanet_albedo_function.yml +++ b/config/ci_configs/slabplanet_albedo_function.yml @@ -1,3 +1,4 @@ +FLOAT_TYPE: "Float32" apply_limiter: false dt: "200secs" dt_cpl: 200 diff --git a/config/ci_configs/slabplanet_albedo_static_map.yml b/config/ci_configs/slabplanet_albedo_static_map.yml index 8c778a285..fa5788287 100644 --- a/config/ci_configs/slabplanet_albedo_static_map.yml +++ b/config/ci_configs/slabplanet_albedo_static_map.yml @@ -1,3 +1,4 @@ +FLOAT_TYPE: "Float32" apply_limiter: false dt: "200secs" dt_cpl: 200 diff --git a/config/ci_configs/slabplanet_albedo_temporal_map.yml b/config/ci_configs/slabplanet_albedo_temporal_map.yml index f0264be82..91a30b3cf 100644 --- a/config/ci_configs/slabplanet_albedo_temporal_map.yml +++ b/config/ci_configs/slabplanet_albedo_temporal_map.yml @@ -1,3 +1,4 @@ +FLOAT_TYPE: "Float32" apply_limiter: false dt: "200secs" dt_cpl: 200 From 212f9f5b32784d23882f312362968148c7d299d4 Mon Sep 17 00:00:00 2001 From: Julia Sloan Date: Fri, 21 Jun 2024 14:07:37 -0700 Subject: [PATCH 13/16] add mre --- debug_2gpu.jl | 106 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 debug_2gpu.jl diff --git a/debug_2gpu.jl b/debug_2gpu.jl new file mode 100644 index 000000000..a01cb6b3c --- /dev/null +++ b/debug_2gpu.jl @@ -0,0 +1,106 @@ +# Approaches: +# - Find index that's not matching +# - output data to NC file, reload and compare + +import ClimaComms +ClimaComms.@import_required_backends +import ClimaCore: Domains, Meshes, Quadratures, Topologies, Spaces, Fields, DataLayouts + +function make_space(comms_ctx, FT) + # Set up parameters + h_elem = 30 + radius = FT(6371e3) + nh_poly = 3 + + # Create space components + domain = Domains.SphereDomain(radius) + mesh =
Meshes.EquiangularCubedSphere(domain, h_elem) + quad = Quadratures.GLL{nh_poly + 1}() + topology = Topologies.DistributedTopology2D( + comms_ctx, + mesh, + Topologies.spacefillingcurve(mesh), + ) + space = Spaces.SpectralElementSpace2D( + topology, + quad; + ) + return space +end + + +# MRE: +FT = Float32 +device = ClimaComms.device() +comms_ctx = ClimaComms.context(device) +ClimaComms.init(comms_ctx) +space = make_space(comms_ctx, FT) + +# Create a field of 0s in top half, 1s in bottom half +f = Fields.ones(space) +ClimaComms.allowscalar(device) do + dims = size(parent(f)) + m = dims[1] + parent(f)[1:(m ÷ 2), :, :, :] .= FT(0) +end + +value₀ = FT(0) +value₁ = FT(1) + +dl = Fields.field_values(f) +pf = parent(f) +fv = Fields.field_values(f) + +local i +ClimaComms.allowscalar(device) do + us = DataLayouts.universal_size(dl) + ctu = CartesianIndices(map(x -> Base.OneTo(x), us)) # CartesianIndex((Ni,Nj,Nf,Nv,Nh)) + # ctp = CartesianIndices(map(x -> Base.OneTo(x), size(pf))) # CartesianIndex((Nv,Ni,Nj,Nf,Nh)) --> giving CartesianIndex((Ni, Nj, Nf/Nv, Nh)) instead + i = findfirst(ctu) do I + # IP = CartesianIndex((I.I[4],I.I[1],I.I[2],I.I[3],I.I[5])) + IP = CartesianIndex((I.I[1],I.I[2],I.I[3],I.I[5])) + fv[I]==value₀ && pf[IP]==value₁ + end + @show i + i +end + + +# Notes: +# land +# ocean +# ice + +# f = land .+ ocean .+ ice + +# pf = parent(land) .+ parent(ocean) .+ parent(ice) # any difference? +# pf = parent(land .+ ocean .+ ice) # any difference? + +# cond = minimum(f) == value₀ +# cond = minimum(parent(f)) == value₁ +# bad_found = minimum(f) == value₀ && minimum(parent(f)) == value₁ && value₀ ≠ value₁ + +# Example: +# julia> size(Fields.field_values(cfield)) +# (4, 4, 1, 64, 5400) + +# julia> DataLayouts.universal_size(Fields.field_values(cfield)) +# (4, 4, 16, 64, 5400) + +# julia> size(parent(Fields.field_values(cfield))) +# (64, 4, 4, 16, 5400) + + +# Next step: +# g = similar(f) +# parent(g) .= CUDA.rand(...)
+# ClimaComms.allowscalar(device) do +# g[i] = value₀ +# end + +# f = land .+ ocean .+ ice +# dl = Fields.field_values(f) +# pf = parent(land) .+ parent(ocean) .+ parent(ice) + +# cond = minimum(g) == value₀ +# cond = minimum(parent(g)) == value₁ From d0560a1dfece823c5973367927f2ac58961e45b2 Mon Sep 17 00:00:00 2001 From: Julia Sloan Date: Fri, 21 Jun 2024 16:32:29 -0700 Subject: [PATCH 14/16] add mre to pipeline --- .buildkite/pipeline.yml | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index c8002ffed..5ff25db26 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -439,6 +439,24 @@ steps: - group: "GPU integration tests" steps: + - label: "GPU MRE" + key: "gpu_mre" + command: "srun julia --threads=3 --color=yes --project=experiments/ClimaEarth/ debug_2gpu.jl" + agents: + slurm_gpus_per_task: 1 + slurm_cpus_per_task: 4 + slurm_ntasks: 1 + slurm_mem: 16GB + - label: "2 GPU MRE" + key: "gpu_2_mre" + command: "srun julia --threads=3 --color=yes --project=experiments/ClimaEarth/ debug_2gpu.jl" + agents: + slurm_gpus_per_task: 1 + slurm_cpus_per_task: 4 + slurm_ntasks: 2 + slurm_mem: 16GB + + - label: "GPU AMIP with diagnostic EDMF" key: "gpu_amip_diagedmf" command: "srun julia --threads=3 --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/run_amip.jl --config_file $BENCHMARK_CONFIG_PATH/amip_diagedmf.yml --job_id gpu_amip_diagedmf" @@ -446,7 +464,7 @@ steps: agents: slurm_gpus_per_task: 1 slurm_cpus_per_task: 4 - slurm_ntasks: 4 + slurm_ntasks: 1 slurm_mem: 16GB - label: "2 GPU AMIP with diagnostic EDMF" key: "gpu_2_amip_diagedmf" From d8ecc5e7ccdbb550677424fdbf5e1dc7a3397dfb Mon Sep 17 00:00:00 2001 From: Julia Sloan Date: Fri, 21 Jun 2024 17:33:16 -0700 Subject: [PATCH 15/16] unify configs --- .../gpu_amip_albedo_temporal_map.yml | 23 +++++++++++-------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git 
a/config/ci_configs/gpu_amip_albedo_temporal_map.yml b/config/ci_configs/gpu_amip_albedo_temporal_map.yml index 236ad5d42..b1c4de853 100644 --- a/config/ci_configs/gpu_amip_albedo_temporal_map.yml +++ b/config/ci_configs/gpu_amip_albedo_temporal_map.yml @@ -2,20 +2,23 @@ FLOAT_TYPE: "Float32" alpha_rayleigh_uh: 0 alpha_rayleigh_w: 10 apply_limiter: false -dt: "150secs" -dt_cpl: 150 +dt: "120secs" +dt_cloud_fraction: 1hours +dt_cpl: 120 dt_rad: "1hours" -dt_save_to_sol: "1days" -dz_bottom: 30 -dz_top: 3000 +dt_save_state_to_disk: "Inf" +dt_save_to_sol: "Inf" +dz_bottom: 30.0 +dz_top: 3000.0 h_elem: 30 land_albedo_type: "map_temporal" mode_name: "amip" moist: "equil" precip_model: "0M" -rad: "gray" -rayleigh_sponge: true -t_end: "300secs" +rad: "allskywithclear" +surface_setup: DefaultMoninObukhov +t_end: "12hours" +use_coupler_diagnostics: false vert_diff: "true" -z_elem: 50 -z_stretch: false +z_elem: 63 +z_max: 55000.0 From 6b29feea6e142180feb0a6e3b2a326b57fbe8246 Mon Sep 17 00:00:00 2001 From: Julia Sloan Date: Mon, 24 Jun 2024 09:20:54 -0700 Subject: [PATCH 16/16] use random values --- debug_2gpu.jl | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/debug_2gpu.jl b/debug_2gpu.jl index a01cb6b3c..3c42a9e28 100644 --- a/debug_2gpu.jl +++ b/debug_2gpu.jl @@ -39,9 +39,10 @@ space = make_space(comms_ctx, FT) # Create a field of 0s in top half, 1s in bottom half f = Fields.ones(space) ClimaComms.allowscalar(device) do - dims = size(parent(f)) - m = dims[1] - parent(f)[1:(m ÷ 2), :, :, :] .= FT(0) + parent(f) .= rand() + # dims = size(parent(f)) + # m = dims[1] + # parent(f)[1:(m ÷ 2), :, :, :] .= FT(0) end value₀ = FT(0)