From d3d8c548223ec79987d7326c808f830bc3f65813 Mon Sep 17 00:00:00 2001
From: Michael Bien <mbien@fh-landshut.de>
Date: Sat, 12 Feb 2011 21:00:02 +0100
Subject: improved HelloJOCL sample. more comments, more printouts, try-finally
 best practice, max WGS agnostic

---
 .../jogamp/opencl/demos/hellojocl/HelloJOCL.java   | 111 ++++++++++++---------
 1 file changed, 64 insertions(+), 47 deletions(-)

(limited to 'src/com/jogamp/opencl/demos')

diff --git a/src/com/jogamp/opencl/demos/hellojocl/HelloJOCL.java b/src/com/jogamp/opencl/demos/hellojocl/HelloJOCL.java
index 1daf890..70900eb 100644
--- a/src/com/jogamp/opencl/demos/hellojocl/HelloJOCL.java
+++ b/src/com/jogamp/opencl/demos/hellojocl/HelloJOCL.java
@@ -3,6 +3,7 @@ package com.jogamp.opencl.demos.hellojocl;
 import com.jogamp.opencl.CLBuffer;
 import com.jogamp.opencl.CLCommandQueue;
 import com.jogamp.opencl.CLContext;
+import com.jogamp.opencl.CLDevice;
 import com.jogamp.opencl.CLKernel;
 import com.jogamp.opencl.CLProgram;
 import java.io.IOException;
@@ -11,6 +12,7 @@ import java.util.Random;
 
 import static java.lang.System.*;
 import static com.jogamp.opencl.CLMemory.Mem.*;
+import static java.lang.Math.*;
 
 /**
  * Hello Java OpenCL example. Adds all elements of buffer A to buffer B
@@ -23,63 +25,78 @@ public class HelloJOCL {
 
     public static void main(String[] args) throws IOException {
 
-        int elementCount = 11444777;                                // Length of arrays to process
-        int localWorkSize = 256;                                    // Local work size dimensions
-        int globalWorkSize = roundUp(localWorkSize, elementCount);  // rounded up to the nearest multiple of the localWorkSize
-
-        // set up
+        // set up (uses default CLPlatform and creates context for all devices)
         CLContext context = CLContext.create();
-
-        CLProgram program = context.createProgram(HelloJOCL.class.getResourceAsStream("VectorAdd.cl")).build();
-
-        CLBuffer<FloatBuffer> clBufferA = context.createFloatBuffer(globalWorkSize, READ_ONLY);
-        CLBuffer<FloatBuffer> clBufferB = context.createFloatBuffer(globalWorkSize, READ_ONLY);
-        CLBuffer<FloatBuffer> clBufferC = context.createFloatBuffer(globalWorkSize, WRITE_ONLY);
-
-        out.println("used device memory: "
-            + (clBufferA.getCLSize()+clBufferB.getCLSize()+clBufferC.getCLSize())/1000000 +"MB");
-
-        // fill read buffers with random numbers (just to have test data; seed is fixed -> results will not change between runs).
-        fillBuffer(clBufferA.getBuffer(), 12345);
-        fillBuffer(clBufferB.getBuffer(), 67890);
-
-        // get a reference to the kernel functon with the name 'VectorAdd'
-        // and map the buffers to its input parameters.
-        CLKernel kernel = program.createCLKernel("VectorAdd");
-        kernel.putArgs(clBufferA, clBufferB, clBufferC).putArg(elementCount);
-
-        // create command queue on fastest device.
-        CLCommandQueue queue = context.getMaxFlopsDevice().createCommandQueue();
-
-        // asynchronous write of data to GPU device, blocking read later to get the computed results back.
-        long time = nanoTime();
-        queue.putWriteBuffer(clBufferA, false)
-             .putWriteBuffer(clBufferB, false)
-             .put1DRangeKernel(kernel, 0, globalWorkSize, localWorkSize)
-             .putReadBuffer(clBufferC, true);
-        time = nanoTime() - time;
-
-        // cleanup all resources associated with this context.
-        context.release();
-
-        // print first few elements of the resulting buffer to the console.
-        out.println("a+b=c results snapshot: ");
-        for(int i = 0; i < 10; i++)
-            out.print(clBufferC.getBuffer().get() + ", ");
-        out.println("...; " + clBufferC.getBuffer().remaining() + " more");
-
-        out.println("computation took: "+(time/1000000)+"ms");
+        out.println("created "+context);
+        
+        // always make sure to release the context under all circumstances
+        // not needed for this particular sample but recommended
+        try{
+            
+            // select fastest device
+            CLDevice device = context.getMaxFlopsDevice();
+            out.println("using "+device);
+
+            // create command queue on device.
+            CLCommandQueue queue = device.createCommandQueue();
+
+            int elementCount = 1444477;                                  // Length of arrays to process
+            int localWorkSize = min(device.getMaxWorkGroupSize(), 256);  // Local work size dimensions
+            int globalWorkSize = roundUp(localWorkSize, elementCount);   // rounded up to the nearest multiple of the localWorkSize
+
+            // load sources, create and build program
+            CLProgram program = context.createProgram(HelloJOCL.class.getResourceAsStream("VectorAdd.cl")).build();
+
+            // A, B are input buffers, C is for the result
+            CLBuffer<FloatBuffer> clBufferA = context.createFloatBuffer(globalWorkSize, READ_ONLY);
+            CLBuffer<FloatBuffer> clBufferB = context.createFloatBuffer(globalWorkSize, READ_ONLY);
+            CLBuffer<FloatBuffer> clBufferC = context.createFloatBuffer(globalWorkSize, WRITE_ONLY);
+
+            out.println("used device memory: "
+                + (clBufferA.getCLSize()+clBufferB.getCLSize()+clBufferC.getCLSize())/1000000 +"MB");
+
+            // fill input buffers with random numbers
+            // (just to have test data; seed is fixed -> results will not change between runs).
+            fillBuffer(clBufferA.getBuffer(), 12345);
+            fillBuffer(clBufferB.getBuffer(), 67890);
+
+            // get a reference to the kernel function with the name 'VectorAdd'
+            // and map the buffers to its input parameters.
+            CLKernel kernel = program.createCLKernel("VectorAdd");
+            kernel.putArgs(clBufferA, clBufferB, clBufferC).putArg(elementCount);
+
+            // asynchronous write of data to GPU device,
+            // followed by blocking read to get the computed results back.
+            long time = nanoTime();
+            queue.putWriteBuffer(clBufferA, false)
+                 .putWriteBuffer(clBufferB, false)
+                 .put1DRangeKernel(kernel, 0, globalWorkSize, localWorkSize)
+                 .putReadBuffer(clBufferC, true);
+            time = nanoTime() - time;
+
+            // print first few elements of the resulting buffer to the console.
+            out.println("a+b=c results snapshot: ");
+            for(int i = 0; i < 10; i++)
+                out.print(clBufferC.getBuffer().get() + ", ");
+            out.println("...; " + clBufferC.getBuffer().remaining() + " more");
+
+            out.println("computation took: "+(time/1000000)+"ms");
+            
+        }finally{
+            // cleanup all resources associated with this context.
+            context.release();
+        }
 
     }
 
-    private static final void fillBuffer(FloatBuffer buffer, int seed) {
+    private static void fillBuffer(FloatBuffer buffer, int seed) {
         Random rnd = new Random(seed);
         while(buffer.remaining() != 0)
             buffer.put(rnd.nextFloat()*100);
         buffer.rewind();
     }
 
-    private static final int roundUp(int groupSize, int globalSize) {
+    private static int roundUp(int groupSize, int globalSize) {
         int r = globalSize % groupSize;
         if (r == 0) {
             return globalSize;
-- 
cgit v1.2.3