JuliaMolSim
diff --git a/‎.github/workflows/CI.yml‎
Lines changed: 1 addition & 0 deletions b/‎.github/workflows/CI.yml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎.gitignore‎
Lines changed: 1 addition & 1 deletion b/‎.gitignore‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎Project.toml‎
Lines changed: 6 additions & 1 deletion b/‎Project.toml‎
Lines changed: 6 additions & 1 deletion
diff --git a/‎README.md‎
Lines changed: 1 addition & 1 deletion b/‎README.md‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎benchmark/benchmarks.jl‎
Lines changed: 29 additions & 36 deletions b/‎benchmark/benchmarks.jl‎
Lines changed: 29 additions & 36 deletions
diff --git a/‎benchmark/protein.jl‎
Lines changed: 15 additions & 15 deletions b/‎benchmark/protein.jl‎
Lines changed: 15 additions & 15 deletions
diff --git a/‎docs/src/documentation.md‎
Lines changed: 19 additions & 8 deletions b/‎docs/src/documentation.md‎
Lines changed: 19 additions & 8 deletions
@@ -29,6 +29,7 @@ jobs:
           - NotGradients
           - Gradients
     steps:
+      - run: echo "UCX_ERROR_SIGNALS=SIGILL,SIGBUS,SIGFPE" >> "$GITHUB_ENV"  # a bare `export` is lost when this step's shell exits; GITHUB_ENV persists it for later steps
       - uses: actions/checkout@v4
       - uses: julia-actions/setup-julia@v2
         with:
 
@@ -2,7 +2,7 @@
 *.jl.*.cov
 *.jl.mem
 docs/build
-/Manifest.toml
+*Manifest.toml
 benchmark/tune.json
 benchmark/results
 .vscode/settings.json
@@ -8,7 +8,6 @@ Atomix = "a9b6321e-bd34-4604-b9c9-b65b8de01458"
 AtomsBase = "a963bdd2-2df7-4f54-a1ee-49d51e6be12a"
 AtomsCalculators = "a3e0e189-c65a-42c1-833c-339540406eb1"
 BioStructures = "de9282ab-8554-53be-b2d6-f6c222edabfc"
-CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 CellListMap = "69e1c6dd-3888-40e6-b3c8-31ac5f578864"
 Chemfiles = "46823bd8-5fb3-5f92-9aa0-96921f3dd015"
 Combinatorics = "861a8166-3701-5b0c-9a16-15d98fcdc6aa"
@@ -17,7 +16,9 @@ Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
 Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
 EzXML = "8f5d6c58-4d21-5cfd-889c-e3ad7ee6a615"
 FLoops = "cc61a311-1640-44b5-9fba-1b764f453329"
+GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
 Graphs = "86223c79-3864-5bf0-83f7-82e725a168b6"
+KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 NearestNeighbors = "b8a86587-4115-5ab1-83bc-aa920d37bbce"
 PeriodicTable = "7b2266bf-644c-5ea3-82d8-af4bbd25a884"
@@ -32,13 +33,15 @@ UnitfulAtomic = "a7773ee8-282e-5fa2-be4e-bd808c38a91a"
 UnsafeAtomicsLLVM = "d80eeb9a-aca5-4d75-85e5-170c8b632249"
 
 [weakdeps]
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 Colors = "5ae59095-9a9b-59fe-a467-6f913c188581"
 Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9"
 GLMakie = "e9467ef8-e4e7-5192-8a1a-b1aee30e663a"
 KernelDensity = "5ab0869b-81aa-558d-bb23-cbf5423bbe9b"
 PythonCall = "6099a3de-0909-46bc-b1f4-468b9a2dfc0d"
 
 [extensions]
+MollyCUDAExt = "CUDA"
 MollyEnzymeExt = "Enzyme"
 MollyGLMakieExt = ["GLMakie", "Colors"]
 MollyKernelDensityExt = "KernelDensity"
@@ -61,7 +64,9 @@ Enzyme = "0.13.20"
 EzXML = "1"
 FLoops = "0.2"
 GLMakie = "0.8, 0.9, 0.10, 0.11"
+GPUArrays = "11"
 Graphs = "1.8"
+KernelAbstractions = "0.9"
 KernelDensity = "0.5, 0.6"
 LinearAlgebra = "1.9"
 NearestNeighbors = "0.4"
 
@@ -34,7 +34,7 @@ Implemented features include:
 - [Unitful.jl](https://github.com/PainterQubits/Unitful.jl) compatibility so numbers have physical meaning.
 - Set up crystal systems using [SimpleCrystals.jl](https://github.com/ejmeitz/SimpleCrystals.jl).
 - Automatic multithreading.
-- GPU acceleration on CUDA-enabled devices.
+- GPU acceleration on all backends supported by [KernelAbstractions.jl](https://github.com/JuliaGPU/KernelAbstractions.jl), with better performance on CUDA-enabled devices.
 - Run with Float64, Float32 or other float types.
 - Some analysis functions, e.g. RDF.
 - Visualise simulations as animations with [Makie.jl](https://makie.juliaplots.org/stable).
 
@@ -17,15 +17,15 @@ else
     @warn "The parallel benchmarks will not be run as Julia is running on 1 thread"
 end
 
-# Allow CUDA device to be specified
-const DEVICE = get(ENV, "DEVICE", "0")
+# Allow GPU device to be specified
+const DEVICE = parse(Int, get(ENV, "DEVICE", "0"))
 
-const run_gpu_tests = CUDA.functional()
-if run_gpu_tests
-    device!(parse(Int, DEVICE))
-    @info "The GPU benchmarks will be run on device $DEVICE"
+const run_cuda_tests = CUDA.functional()
+if run_cuda_tests
+    device!(DEVICE)
+    @info "The CUDA benchmarks will be run on device $DEVICE"
 else
-    @warn "The GPU benchmarks will not be run as a CUDA-enabled device is not available"
+    @warn "The CUDA benchmarks will not be run as a CUDA-enabled device is not available"
 end
 
 const SUITE = BenchmarkGroup(
@@ -62,7 +62,7 @@ const starting_velocities = [random_velocity(atom_mass, 1.0u"K") for i in 1:n_at
 const starting_coords_f32 = [Float32.(c) for c in starting_coords]
 const starting_velocities_f32 = [Float32.(c) for c in starting_velocities]
 
-function test_sim(nl::Bool, parallel::Bool, f32::Bool, gpu::Bool)
+function test_sim(nl::Bool, parallel::Bool, f32::Bool, ::Type{AT}) where AT
     n_atoms = 400
     n_steps = 200
     atom_mass = f32 ? 10.0f0u"g/mol" : 10.0u"g/mol"
@@ -72,34 +72,27 @@ function test_sim(nl::Bool, parallel::Bool, f32::Bool, gpu::Bool)
     r0 = f32 ? 0.2f0u"nm" : 0.2u"nm"
     bonds = [HarmonicBond(k=k, r0=r0) for i in 1:(n_atoms ÷ 2)]
     specific_inter_lists = (InteractionList2Atoms(
-        gpu ? CuArray(Int32.(collect(1:2:n_atoms))) : Int32.(collect(1:2:n_atoms)),
-        gpu ? CuArray(Int32.(collect(2:2:n_atoms))) : Int32.(collect(2:2:n_atoms)),
-        gpu ? CuArray(bonds) : bonds,
+        AT(Int32.(collect(1:2:n_atoms))),
+        AT(Int32.(collect(2:2:n_atoms))),
+        AT(bonds),
     ),)
 
     neighbor_finder = NoNeighborFinder()
     cutoff = DistanceCutoff(f32 ? 1.0f0u"nm" : 1.0u"nm")
     pairwise_inters = (LennardJones(use_neighbors=false, cutoff=cutoff),)
     if nl
         neighbor_finder = DistanceNeighborFinder(
-            eligible=gpu ? CuArray(trues(n_atoms, n_atoms)) : trues(n_atoms, n_atoms),
+            eligible=AT(trues(n_atoms, n_atoms)),
             n_steps=10,
             dist_cutoff=f32 ? 1.5f0u"nm" : 1.5u"nm",
         )
         pairwise_inters = (LennardJones(use_neighbors=true, cutoff=cutoff),)
     end
 
-    if gpu
-        coords = CuArray(copy(f32 ? starting_coords_f32 : starting_coords))
-        velocities = CuArray(copy(f32 ? starting_velocities_f32 : starting_velocities))
-        atoms = CuArray([Atom(mass=atom_mass, charge=f32 ? 0.0f0 : 0.0, σ=f32 ? 0.2f0u"nm" : 0.2u"nm",
-                              ϵ=f32 ? 0.2f0u"kJ * mol^-1" : 0.2u"kJ * mol^-1") for i in 1:n_atoms])
-    else
-        coords = copy(f32 ? starting_coords_f32 : starting_coords)
-        velocities = copy(f32 ? starting_velocities_f32 : starting_velocities)
-        atoms = [Atom(mass=atom_mass, charge=f32 ? 0.0f0 : 0.0, σ=f32 ? 0.2f0u"nm" : 0.2u"nm",
-                      ϵ=f32 ? 0.2f0u"kJ * mol^-1" : 0.2u"kJ * mol^-1") for i in 1:n_atoms]
-    end
+    coords = AT(copy(f32 ? starting_coords_f32 : starting_coords))
+    velocities = AT(copy(f32 ? starting_velocities_f32 : starting_velocities))
+    atoms = AT([Atom(charge=f32 ? 0.0f0 : 0.0, mass=atom_mass, σ=f32 ? 0.2f0u"nm" : 0.2u"nm",
+                     ϵ=f32 ? 0.2f0u"kJ * mol^-1" : 0.2u"kJ * mol^-1") for i in 1:n_atoms])
 
     sys = System(
         atoms=atoms,
@@ -117,22 +110,22 @@ function test_sim(nl::Bool, parallel::Bool, f32::Bool, gpu::Bool)
 end
 
 runs = [
-    ("CPU"       , [false, false, false, false]),
-    ("CPU f32"   , [false, false, true , false]),
-    ("CPU NL"    , [true , false, false, false]),
-    ("CPU f32 NL", [true , false, true , false]),
+    ("CPU"       , [false, false, false, Array]),
+    ("CPU f32"   , [false, false, true , Array]),
+    ("CPU NL"    , [true , false, false, Array]),
+    ("CPU f32 NL", [true , false, true , Array]),
 ]
 if run_parallel_tests
-    push!(runs, ("CPU parallel"       , [false, true , false, false]))
-    push!(runs, ("CPU parallel f32"   , [false, true , true , false]))
-    push!(runs, ("CPU parallel NL"    , [true , true , false, false]))
-    push!(runs, ("CPU parallel f32 NL", [true , true , true , false]))
+    push!(runs, ("CPU parallel"       , [false, true , false, Array]))
+    push!(runs, ("CPU parallel f32"   , [false, true , true , Array]))
+    push!(runs, ("CPU parallel NL"    , [true , true , false, Array]))
+    push!(runs, ("CPU parallel f32 NL", [true , true , true , Array]))
 end
-if run_gpu_tests
-    push!(runs, ("GPU"       , [false, false, false, true]))
-    push!(runs, ("GPU f32"   , [false, false, true , true]))
-    push!(runs, ("GPU NL"    , [true , false, false, true]))
-    push!(runs, ("GPU f32 NL", [true , false, true , true]))
+if run_cuda_tests
+    push!(runs, ("GPU"       , [false, false, false, CuArray]))
+    push!(runs, ("GPU f32"   , [false, false, true , CuArray]))
+    push!(runs, ("GPU NL"    , [true , false, false, CuArray]))
+    push!(runs, ("GPU f32 NL", [true , false, true , CuArray]))
 end
 
 for (name, args) in runs
 
@@ -11,7 +11,7 @@ const data_dir = normpath(dirname(pathof(Molly)), "..", "data")
 const ff_dir = joinpath(data_dir, "force_fields")
 const openmm_dir = joinpath(data_dir, "openmm_6mrr")
 
-function setup_system(gpu::Bool, f32::Bool, units::Bool)
+function setup_system(::Type{AT}, f32::Bool, units::Bool) where AT
     T = f32 ? Float32 : Float64
     ff = MolecularForceField(
         T,
@@ -27,9 +27,9 @@ function setup_system(gpu::Bool, f32::Bool, units::Bool)
     sys = System(
         joinpath(data_dir, "6mrr_equil.pdb"),
         ff;
-        velocities=gpu ? CuArray(velocities) : velocities,
+        velocities=AT(velocities),
         units=units,
-        gpu=gpu,
+        array_type=AT,
         dist_cutoff=(units ? dist_cutoff * u"nm" : dist_cutoff),
         dist_neighbors=(units ? dist_neighbors * u"nm" : dist_neighbors),
     )
@@ -41,21 +41,21 @@ function setup_system(gpu::Bool, f32::Bool, units::Bool)
 end
 
 runs = [
-    # run_name                             gpu    parr   f32    units
-    ("CPU 1 thread"                      , false, false, false, true ),
-    ("CPU 1 thread f32"                  , false, false, true , true ),
-    ("CPU 1 thread f32 nounits"          , false, false, true , false),
-    ("CPU $n_threads threads"            , false, true , false, true ),
-    ("CPU $n_threads threads f32"        , false, true , true , true ),
-    ("CPU $n_threads threads f32 nounits", false, true , true , false),
-    ("GPU"                               , true , false, false, true ),
-    ("GPU f32"                           , true , false, true , true ),
-    ("GPU f32 nounits"                   , true , false, true , false),
+    # run_name                             AT       parr   f32    units
+    ("CPU 1 thread"                      , Array  , false, false, true ),
+    ("CPU 1 thread f32"                  , Array  , false, true , true ),
+    ("CPU 1 thread f32 nounits"          , Array  , false, true , false),
+    ("CPU $n_threads threads"            , Array  , true , false, true ),
+    ("CPU $n_threads threads f32"        , Array  , true , true , true ),
+    ("CPU $n_threads threads f32 nounits", Array  , true , true , false),
+    ("GPU"                               , CuArray, false, false, true ),
+    ("GPU f32"                           , CuArray, false, true , true ),
+    ("GPU f32 nounits"                   , CuArray, false, true , false),
 ]
 
-for (run_name, gpu, parallel, f32, units) in runs
+for (run_name, AT, parallel, f32, units) in runs
     n_threads_used = parallel ? n_threads : 1
-    sys, sim = setup_system(gpu, f32, units)
+    sys, sim = setup_system(AT, f32, units)
     simulate!(deepcopy(sys), sim, 20; n_threads=n_threads_used)
     println(run_name)
     @time simulate!(sys, sim, n_steps; n_threads=n_threads_used)
 
@@ -135,11 +135,21 @@ visualize(sys.loggers.coords, boundary, "sim_lj.mp4")
 
 ## GPU acceleration
 
-To run simulations on the GPU you will need to have a CUDA-compatible device.
-[CUDA.jl](https://github.com/JuliaGPU/CUDA.jl) is used to run on the device.
+To run simulations on the GPU you will need to have a GPU available and then load the appropriate package:
+
+| Hardware Available | Necessary Package | Array Type |
+| ------------------ | ----------------- | ---------- |
+| Parallel CPU       | none              | `Array`    |
+| NVIDIA GPU         | CUDA              | `CuArray`  |
+| AMD GPU            | AMDGPU            | `ROCArray` |
+| Intel GPU          | oneAPI            | `oneArray` |
+| Apple Silicon      | Metal             | `MtlArray` |
+
+Note that Metal/Apple Silicon devices can only run with 32-bit precision, so be sure to use `Float32` (for example) where necessary.
 Simulation setup is similar to above, but with the coordinates, velocities and atoms moved to the GPU.
 This example also shows setting up a simulation to run with `Float32`, which gives much better performance on GPUs.
 Of course, you will need to determine whether this level of numerical accuracy is appropriate in your case.
+Here is an example script for an NVIDIA GPU using CUDA:
 ```julia
 using Molly
 using CUDA
@@ -168,6 +178,7 @@ sys = System(
 simulate!(deepcopy(sys), simulator, 20) # Compile function
 simulate!(sys, simulator, 1_000)
 ```
+To use another GPU package, just swap out `CUDA` for your desired package and `CuArray` for your desired array type.
 The device to run on can be changed with `device!`, e.g. `device!(1)`.
 The GPU code path is currently designed to be compatible with differentiable simulation and runs slower than related software, but this is an active area of development.
 Nonetheless, GPU performance is significantly better than CPU performance and is good enough for many applications.
@@ -316,7 +327,7 @@ sys = System(
         energy=TotalEnergyLogger(10),
         writer=StructureWriter(10, "traj_6mrr_1ps.pdb", ["HOH"]),
     ),
-    gpu=false,
+    array_type=Array,
 )
 
 minimizer = SteepestDescentMinimizer()
@@ -352,7 +363,7 @@ Residue patches, virtual sites, file includes and any force types other than `Ha
 
     Some PDB files that read in fine can be found [here](https://github.com/greener-group/GB99dms/tree/main/structures/training/conf_1).
 
-To run on the GPU, set `gpu=true`.
+To run on the GPU, set `array_type=GPUArrayType`, where `GPUArrayType` is the array type for your GPU backend (for example `CuArray` for NVIDIA or `ROCArray` for AMD).
 You can use an implicit solvent method by giving the `implicit_solvent` keyword argument to [`System`](@ref).
 The options are `"obc1"`, `"obc2"` and `"gbn2"`, corresponding to the Onufriev-Bashford-Case GBSA model with parameter set I or II and the GB-Neck2 model.
 Other options include overriding the boundary dimensions in the file (`boundary`) and modifying the non-bonded interaction and neighbor list cutoff distances (`dist_cutoff` and `dist_neighbors`).
@@ -1017,10 +1028,10 @@ function Molly.simulate!(sys::ReplicaSystem,
 end
 ```
 
-Under the hood there are two implementations for the [`forces`](@ref) function, used by [`accelerations`](@ref), and for [`potential_energy`](@ref): a version geared towards CPUs and parallelism, and a version geared towards GPUs.
-You can define different versions of a simulator for CPU and GPU systems by dispatching on `System{D, false}` or `System{D, true}` respectively.
+Under the hood there are multiple implementations for the [`forces`](@ref) function, used by [`accelerations`](@ref), and for [`potential_energy`](@ref): a version geared towards CPUs and parallelism, a CUDA version, and a version for other GPU backends.
+You can define different versions of a simulator for CPU, CUDA and generic GPU systems by dispatching on `System{D, Array}`, `System{D, CuArray}` and `System{D, AT} where AT <: AbstractGPUArray` respectively.
 This also applies to coupling methods, neighbor finders and analysis functions.
-You do not have to define two versions though: you may only intend to use the simulator one way, or one version may be performant in all cases.
+You do not have to define different versions though: you may only intend to use the simulator one way, or one version may be performant in all cases.
 
 ## Coupling
 
@@ -1321,7 +1332,7 @@ The available neighbor finders are:
 - [`DistanceNeighborFinder`](@ref)
 - [`TreeNeighborFinder`](@ref)
 
-The recommended neighbor finder is [`CellListMapNeighborFinder`](@ref) on CPU and [`GPUNeighborFinder`](@ref) on GPU.
+The recommended neighbor finder is [`CellListMapNeighborFinder`](@ref) on CPU, [`GPUNeighborFinder`](@ref) on NVIDIA GPUs and [`DistanceNeighborFinder`](@ref) on other GPUs.
 When using a neighbor finder you should in general also use an interaction cutoff (see [Cutoffs](@ref)) with a cutoff distance less than the neighbor finder distance.
 The difference between the two should be larger than an atom can move in the time of the `n_steps` defined by the neighbor finder.
 The exception is [`GPUNeighborFinder`](@ref), which uses the algorithm from [Eastman and Pande 2010](https://doi.org/10.1002/jcc.21413) to avoid calculating a neighbor list and should have `dist_cutoff` set to the interaction cutoff distance.