Test with a worksharing strategy (based on TBB task scheduler) + test with parallel_do.

2012-04-10 13:23:51 +00:00 · 2012-04-10 13:23:51 +00:00 · 69272e4d9a
parent 57471588b2
commit 69272e4d9a
12 changed files with 745 additions and 86 deletions
--- a/.gitattributes
+++ b/.gitattributes
@ -2709,6 +2709,7 @@ Mesh_3/examples/Mesh_3/old_stuff/inputs/cube.mesh -text svneol=unset#application
 Mesh_3/examples/Mesh_3/old_stuff/inputs/tangle.mesh -text svneol=unset#application/octet-stream
 Mesh_3/include/CGAL/Mesh_3/Locking_data_structures.h -text
 Mesh_3/include/CGAL/Mesh_3/Profiling_tools.h -text
+Mesh_3/include/CGAL/Mesh_3/Worksharing_data_structures.h -text
 Mesh_3/include/CGAL/Meshes/Filtered_multimap_container.h -text
 Mesh_3/include/CGAL/Triangulation_lazy_ds_cell_base_3.h -text
 Mesh_3/package_info/Mesh_3/description.txt -text
--- a/Mesh_2/include/CGAL/Mesher_level.h
+++ b/Mesh_2/include/CGAL/Mesher_level.h
@ -26,24 +26,28 @@
 #endif

 #ifdef CONCURRENT_MESH_3
-  #include <algorithm>
+# include <algorithm>

-  #include <tbb/tbb.h>
+# include <tbb/tbb.h>

-  #include <CGAL/hilbert_sort.h> //CJTODO: remove?
-  #include <CGAL/spatial_sort.h> //CJTODO: remove?
-  #include <CGAL/Mesh_3/Locking_data_structures.h> // CJODO TEMP?
-  #include <CGAL/BBox_3.h>
+# include <CGAL/hilbert_sort.h> //CJTODO: remove?
+# include <CGAL/spatial_sort.h> //CJTODO: remove?
+# include <CGAL/Mesh_3/Locking_data_structures.h> // CJTODO TEMP?
+# ifdef CGAL_MESH_3_WORKSHARING_USES_TASKS
+#   include <CGAL/Mesh_3/Worksharing_data_structures.h>
+#   include <tbb/task.h>
+# endif
+# include <CGAL/BBox_3.h>
   
-  #ifdef CGAL_CONCURRENT_MESH_3_PROFILING
-    #define CGAL_PROFILE
-    #include <CGAL/Profile_counter.h>
-  #endif
+# ifdef CGAL_CONCURRENT_MESH_3_PROFILING
+#   define CGAL_PROFILE
+#   include <CGAL/Profile_counter.h>
+# endif
  
  // CJTODO TEMP TEST
-#ifdef CGAL_MESH_3_DO_NOT_LOCK_INFINITE_VERTEX
+# ifdef CGAL_MESH_3_DO_NOT_LOCK_INFINITE_VERTEX
  extern bool g_is_set_cell_active;
-#endif
+# endif

  // CJTODO TEMP: not thread-safe => move it to Mesher_3
  extern CGAL::Bbox_3 g_bbox;
@ -545,6 +549,11 @@ public:
    typedef typename Derived::Container::Element Container_element;
    typedef typename Derived::Container::Quality Container_quality;

+  //=======================================================
+  //================= PARALLEL_FOR?
+  //=======================================================
+
+# ifdef CGAL_MESH_3_WORKSHARING_USES_PARALLEL_FOR
    /*std::pair<Container_quality, Container_element>
      raw_elements[ELEMENT_BATCH_SIZE];*/
    std::vector<Container_element> container_elements;
@ -573,9 +582,9 @@ public:
      indices.push_back(iElt);
    }

-# ifdef CGAL_CONCURRENT_MESH_3_VERBOSE
+#   ifdef CGAL_CONCURRENT_MESH_3_VERBOSE
    std::cerr << "Refining a batch of " << iElt << " elements...";
-# endif
+#   endif
    
    // Doesn't help much
    //typedef Spatial_sort_traits_adapter_3<Tr::Geom_traits, Point*> Search_traits;
@ -599,8 +608,6 @@ public:
        {
          for( size_t i = r.begin() ; i != r.end() ; )
          {
-            before_next_element_refinement(visitor);
-
            std::ptrdiff_t index = indices[i];
            Container_element ce = container_elements[index];

@ -616,7 +623,6 @@ public:
                break;

              case COULD_NOT_LOCK_ZONE:
-              case COULD_NOT_LOCK_ELEMENT:
              {
                // Swap indices[i] and indices[i+1]
                if (i+1 != r.end())
@ -634,11 +640,15 @@ public:
                break;
              }
              
+              case COULD_NOT_LOCK_ELEMENT:
+                // We retry it now (intentional fall-through to the break below)
              case THE_FACET_TO_REFINE_IS_NOT_IN_ITS_CONFLICT_ZONE:
                // We retry it since we switched to exact computation
                // for the adjacent cells circumcenters
                break;
            }
+            
+            before_next_element_refinement(visitor);
          }
        }
      );
@ -681,9 +691,195 @@ public:
      }
    }

-# ifdef CGAL_CONCURRENT_MESH_3_VERBOSE
-      std::cerr << " batch done." << std::endl;
-# endif
+#   ifdef CGAL_CONCURRENT_MESH_3_VERBOSE
+    std::cerr << " batch done." << std::endl;
+#   endif
+      
+  //=======================================================
+  //================= PARALLEL_DO?
+  //=======================================================
+
+# elif defined(CGAL_MESH_3_WORKSHARING_USES_PARALLEL_DO)
+    std::vector<Container_element> container_elements;
+    container_elements.reserve(ELEMENT_BATCH_SIZE);
+    
+    while(!no_longer_element_to_refine())
+    {
+      Container_element ce = derived().get_next_raw_element_impl().second;
+      pop_next_element();
+      container_elements.push_back(ce);
+    }
+
+#   ifdef CGAL_CONCURRENT_MESH_3_VERBOSE
+    std::cerr << "Refining elements in parallel...";
+#   endif
+    
+    // CJTODO: lambda functions OK?
+    
+    //g_is_set_cell_active = false;
+    previous_level.add_to_TLS_lists(true);
+    add_to_TLS_lists(true);
+    tbb::parallel_do(
+      container_elements.begin(), container_elements.end(),
+      [&] (Container_element& ce, tbb::parallel_do_feeder<Container_element>& feeder)
+      {
+        Mesher_level_conflict_status status;
+        do 
+        {
+          status = try_lock_and_refine_element(ce, visitor);
+        }
+        while (status == COULD_NOT_LOCK_ELEMENT 
+          || status == THE_FACET_TO_REFINE_IS_NOT_IN_ITS_CONFLICT_ZONE);
+
+        switch (status)
+        {
+          case NO_CONFLICT:
+          case CONFLICT_AND_ELEMENT_SHOULD_BE_DROPPED:
+          case ELEMENT_WAS_A_ZOMBIE:
+            break;
+
+          case COULD_NOT_LOCK_ZONE:
+          {
+            feeder.add(ce);
+            break;
+          }
+              
+          /*case COULD_NOT_LOCK_ELEMENT:
+            // We retry it now
+          case THE_FACET_TO_REFINE_IS_NOT_IN_ITS_CONFLICT_ZONE:
+            // We retry it since we switched to exact computation
+            // for the adjacent cells circumcenters
+            break;*/
+        }
+        
+        before_next_element_refinement(visitor); 
+
+        // Finally we add the new local bad_elements to the feeder
+        while (no_longer_local_element_to_refine() == false)
+        {
+          typedef typename Derived::Container::Element Container_element;
+          Container_element ce = derived().get_next_local_raw_element_impl().second;
+          pop_next_local_element();
+
+          feeder.add(ce);
+        } 
+      }
+    );
+    splice_local_lists();
+    CGAL_assertion(no_longer_element_to_refine());
+    //previous_level.splice_local_lists(); // useless
+    previous_level.add_to_TLS_lists(false);
+    add_to_TLS_lists(false);
+    //g_is_set_cell_active = true;
+    
+
+#   ifdef CGAL_CONCURRENT_MESH_3_VERBOSE
+    std::cerr << " done." << std::endl;
+#   endif
+  //=======================================================
+  //================= TASKS?
+  //=======================================================
+
+# elif defined(CGAL_MESH_3_WORKSHARING_USES_TASKS)
+
+    std::vector<Container_element> container_elements;
+    container_elements.reserve(ELEMENT_BATCH_SIZE);
+    
+    int iElt = 0;
+    for( ; 
+          iElt < ELEMENT_BATCH_SIZE && !no_longer_element_to_refine() ; 
+          ++iElt )
+    {
+      Container_element ce = derived().get_next_raw_element_impl().second;
+      pop_next_element();
+      container_elements.push_back(ce);
+    }
+    
+#   ifdef CGAL_CONCURRENT_MESH_3_VERBOSE
+    std::cerr << "Refining a batch of " << iElt << " elements...";
+#   endif
+    
+    // CJTODO: lambda functions OK?
+    if (iElt > 20)
+    {
+      //g_is_set_cell_active = false;
+      previous_level.add_to_TLS_lists(true);
+      add_to_TLS_lists(true);
+      
+      tbb::task& empty_root_task = *new( tbb::task::allocate_root() ) tbb::empty_task;
+      empty_root_task.set_ref_count(iElt + 1);
+
+      for( size_t i = 0 ; i < iElt ; ++i)
+      {
+        Container_element ce = container_elements[i];
+        
+        Mesh_3::enqueue_work(
+          [&, ce, visitor]()
+          {
+            Mesher_level_conflict_status status;
+            do
+            {
+              status = try_lock_and_refine_element(ce, visitor);
+              before_next_element_refinement(visitor);
+            }
+            while (status != NO_CONFLICT
+              && status != CONFLICT_AND_ELEMENT_SHOULD_BE_DROPPED
+              && status != ELEMENT_WAS_A_ZOMBIE);
+          },
+          empty_root_task,
+          circumcenter(derived().extract_element_from_container_value(ce)));
+      }
+      empty_root_task.wait_for_all();
+      tbb::task::destroy(empty_root_task);
+
+      splice_local_lists();
+      //previous_level.splice_local_lists(); // useless
+      previous_level.add_to_TLS_lists(false);
+      add_to_TLS_lists(false);
+      //g_is_set_cell_active = true;
+    }
+    // Go sequential
+    else
+    {
+      for (int i = 0 ; i < iElt ; )
+      {
+        std::ptrdiff_t index = i;
+
+        Derived &derivd = derived();
+        //Container_element ce = raw_elements[index].second;
+        Container_element ce = container_elements[index];
+        if( !derivd.is_zombie(ce) )
+        {
+          // Lock the element area on the grid
+          Element element = derivd.extract_element_from_container_value(ce);
+          
+          const Mesher_level_conflict_status result 
+            = try_to_refine_element(element, visitor);
+
+          if (result != CONFLICT_BUT_ELEMENT_CAN_BE_RECONSIDERED
+            && result != THE_FACET_TO_REFINE_IS_NOT_IN_ITS_CONFLICT_ZONE)
+          {
+            ++i;
+          }
+        }
+        else
+        {
+          ++i;
+        }
+        // Unlock
+        unlock_all_thread_local_elements();
+      }
+    }
+
+#   ifdef CGAL_CONCURRENT_MESH_3_VERBOSE
+    std::cerr << " batch done." << std::endl;
+#   endif
+
+#endif
+  //=======================================================
+  //================= / WORKSHARING STRATEGY
+  //=======================================================
+
  }

  /** 
@ -712,36 +908,37 @@ public:
    std::cerr << "Trying to insert point: " << p << std::endl;
 #endif
    
+    
+//=========================================
 //==== Simple Grid locking
+//=========================================
 #if defined(CGAL_MESH_3_CONCURRENT_REFINEMENT) && \
    defined(CGAL_MESH_3_LOCKING_STRATEGY_SIMPLE_GRID_LOCKING)

    Mesher_level_conflict_status result;
    Zone zone;
-    if( g_lock_grid.try_lock(p).first )
-    {
-      before_conflicts(e, p, visitor);

-      bool could_lock_zone;
-      bool facet_not_in_its_cz = false;
-      zone = conflicts_zone(p, e, facet_not_in_its_cz, could_lock_zone);
+    before_conflicts(e, p, visitor);
      
-      if (!could_lock_zone)
-        result = COULD_NOT_LOCK_ZONE;
-      else if (facet_not_in_its_cz)
-        result = THE_FACET_TO_REFINE_IS_NOT_IN_ITS_CONFLICT_ZONE;
-      else
-        result = test_point_conflict(p, zone, visitor);
-    }
-    else
-    {
+    bool could_lock_zone;
+    bool facet_not_in_its_cz = false;
+    zone = conflicts_zone(p, e, facet_not_in_its_cz, could_lock_zone);
+      
+    if (!could_lock_zone)
      result = COULD_NOT_LOCK_ZONE;
-    }
+    else if (facet_not_in_its_cz)
+      result = THE_FACET_TO_REFINE_IS_NOT_IN_ITS_CONFLICT_ZONE;
+    else
+      result = test_point_conflict(p, zone, visitor);

-//==== !Simple Grid locking
+//=========================================
+//==== NOT Simple Grid locking
+//=========================================
 #else
    
-    // Concurrent?
+    before_conflicts(e, p, visitor);
+
+    //=========== Concurrent? =============
 #  ifdef CGAL_MESH_3_CONCURRENT_REFINEMENT
    bool could_lock_zone;
    bool facet_not_in_its_cz = false;
@ -754,7 +951,7 @@ public:
    else
      result = test_point_conflict(p, zone, visitor);

-    // ... or not?
+    //=========== or not? =================
 #  else
    bool facet_not_in_its_cz = false;
    Zone zone = conflicts_zone(p, e, facet_not_in_its_cz);
@ -766,6 +963,9 @@ public:
 #  endif

 #endif
+//=========================================
+//==== / Simple Grid locking
+//=========================================
      
 #ifdef CGAL_MESHES_DEBUG_REFINEMENT_POINTS
    std::cerr << "(" << p << ") ";
--- a/Mesh_3/benchmark/Mesh_3/concurrency.cpp
+++ b/Mesh_3/benchmark/Mesh_3/concurrency.cpp
@ -14,7 +14,7 @@
 # define CGAL_MESH_3_CONCURRENT_REFINEMENT
  // In case some code uses CGAL_PROFILE, it needs to be concurrent
 # define CGAL_CONCURRENT_PROFILE
-//# define CGAL_CONCURRENT_MESH_3_VERBOSE
+# define CGAL_CONCURRENT_MESH_3_VERBOSE

  // ==========================================================================
  // Locking strategy
@ -24,10 +24,29 @@
 //#   define CGAL_MESH_3_LOCKING_STRATEGY_CELL_LOCK
 #   define CGAL_MESH_3_LOCKING_STRATEGY_SIMPLE_GRID_LOCKING
 //#   define CGAL_MESH_3_CONCURRENT_REFINEMENT_LOCK_ADJ_CELLS
+//#   define CGAL_MESH_3_DO_NOT_LOCK_INFINITE_VERTEX
+//#   define CGAL_MESH_3_ACTIVATE_GRID_INDEX_CACHE_IN_VERTEX

+//#   define CGAL_MESH_3_WORKSHARING_USES_TASKS
+//#     define CGAL_MESH_3_WORKSHARING_USES_PARALLEL_FOR
+#     define CGAL_MESH_3_WORKSHARING_USES_PARALLEL_DO
+
+#   ifdef CGAL_MESH_3_WORKSHARING_USES_TASKS
+    const int MESH_3_LOCKING_GRID_NUM_CELLS_PER_AXIS = 25;
+    const int MESH_3_FIRST_GRID_LOCK_RADIUS = 0;
+
+    const int MESH_3_WORK_STATS_GRID_NUM_CELLS_PER_AXIS = 2;
+    const int MESH_3_WORK_STATS_GRID_NUM_CELLS = 
+      MESH_3_WORK_STATS_GRID_NUM_CELLS_PER_AXIS*
+      MESH_3_WORK_STATS_GRID_NUM_CELLS_PER_AXIS*
+      MESH_3_WORK_STATS_GRID_NUM_CELLS_PER_AXIS;
+
+#   else
    const int MESH_3_LOCKING_GRID_NUM_CELLS_PER_AXIS = 30;
    const int MESH_3_FIRST_GRID_LOCK_RADIUS = 2;
    const int MESH_3_REFINEMENT_GRAINSIZE = 10;
+#   endif
+

 #   ifdef CGAL_MESH_3_LOCKING_STRATEGY_CELL_LOCK
 #     include <tbb/recursive_mutex.h>
@ -46,7 +65,7 @@
  // Concurrency Parameters
  // ==========================================================================

-  const size_t ELEMENT_BATCH_SIZE = 10000;
+  const size_t ELEMENT_BATCH_SIZE = 100000;

  // ==========================================================================
  // Profiling
@ -85,8 +104,20 @@ bool g_temp = false;
  Global_mutex_type g_global_mutex; // CJTODO: temporary
  
  // CJTODO TEMP: not thread-safe => move it to Mesher_3
+  
  // Elephant.off => BBox (x,y,z): [ -0.358688, 0.356308 ], [ -0.498433, 0.49535 ], [ -0.298931, 0.298456 ]
-  CGAL::Bbox_3 g_bbox(-0.35, 0.35, -0.5, 0.5, -0.3, 0.3);
+  //const char *INPUT_FILE_NAME = "D:/INRIA/CGAL/workingcopy/Mesh_3/examples/Mesh_3/data/elephant.off";
+  //CGAL::Bbox_3 g_bbox(-0.36, 0.36, -0.5, 0.5, -0.3, 0.3);
+  
+  // Fandisk.off => BBox (x,y,z): [ -0.4603, 0.4603 ], [ -0.254894, 0.25555 ], [ -0.499801, 0.499177 ], 
+  const char *INPUT_FILE_NAME = "D:/INRIA/CGAL/workingcopy/Mesh_3/examples/Mesh_3/data/fandisk.off";
+  CGAL::Bbox_3 g_bbox(-0.47, 0.47, -0.26, 0.26, -0.5, 0.5);
+  
+# ifdef CGAL_MESH_3_WORKSHARING_USES_TASKS
+#   include <CGAL/Mesh_3/Worksharing_data_structures.h> // CJTODO TEMP?
+    CGAL::Mesh_3::Worksharing_ds_type g_worksharing_ds;
+# endif
+
 # ifdef CGAL_MESH_3_LOCKING_STRATEGY_SIMPLE_GRID_LOCKING
  CGAL::Mesh_3::Refinement_grid_type g_lock_grid(g_bbox, MESH_3_LOCKING_GRID_NUM_CELLS_PER_AXIS);

@ -172,14 +203,24 @@ bool refine_mesh(const std::string &input_filename)
  // Create domain
  Mesh_domain domain(polyhedron);

+  // Very small elements
  Mesh_parameters params;
  params.facet_angle = 25;
+  params.facet_sizing = 0.001;
+  params.facet_approx = 0.0068;
+  params.tet_shape = 3;
+  params.tet_sizing = 0.001;
+  
+  // Middle-size elements
+  /*Mesh_parameters params;
+  params.facet_angle = 25;
  params.facet_sizing = 0.002;
  params.facet_approx = 0.0068;
-  /*params.tet_shape = 3;
-  params.tet_sizing = 1.;*/
+  params.tet_shape = 3;
+  params.tet_sizing = 0.005;*/

  std::cerr 
+    << "File: " << input_filename << std::endl
    << "Parameters: " << std::endl 
    << params.log() << std::endl;

@ -187,9 +228,9 @@ bool refine_mesh(const std::string &input_filename)
  Mesh_criteria criteria(
    facet_angle=params.facet_angle,
    facet_size=params.facet_sizing,
-    facet_distance=params.facet_approx/*,
+    facet_distance=params.facet_approx,
    cell_size=params.tet_sizing,
-    cell_radius_edge_ratio=params.tet_shape*/
+    cell_radius_edge_ratio=params.tet_shape
  );

  // Mesh generation
@ -219,7 +260,7 @@ int main()
  for(int i = 1 ; ; ++i)
  {
    std::cerr << "Refinement #" << i << "..." << std::endl;
-    refine_mesh("D:/INRIA/CGAL/workingcopy/Mesh_3/examples/Mesh_3/data/elephant.off");
+    refine_mesh(INPUT_FILE_NAME);
    std::cerr << "Refinement #" << i << " done." << std::endl;
    std::cerr << std::endl << "---------------------------------" << std::endl << std::endl;
  }
--- a/Mesh_3/demo/Mesh_3/Mesh_3_plugin.cpp
+++ b/Mesh_3/demo/Mesh_3/Mesh_3_plugin.cpp
@ -34,28 +34,38 @@
 bool g_temp = false;

 #ifdef CONCURRENT_MESH_3
-  #include <CGAL/Mesh_3/Locking_data_structures.h> // CJODO TEMP?
-  #include <CGAL/BBox_3.h>
+
+# include <CGAL/BBox_3.h>
+# include <CGAL/Mesh_3/Locking_data_structures.h> // CJTODO TEMP?

  // CJTODO TEMP TEST
-#ifdef CGAL_MESH_3_DO_NOT_LOCK_INFINITE_VERTEX
+# ifdef CGAL_MESH_3_DO_NOT_LOCK_INFINITE_VERTEX
  bool g_is_set_cell_active = true;
-#endif
+# endif

  Global_mutex_type g_global_mutex; // CJTODO: temporary

-
-  // CJTODO TEMP: not thread-safe => move it to Mesher_3
  // Elephant.off => BBox (x,y,z): [ -0.358688, 0.356308 ], [ -0.498433, 0.49535 ], [ -0.298931, 0.298456 ]
-  CGAL::Bbox_3 g_bbox(-0.35, 0.35, -0.5, 0.5, -0.3, 0.3);
+  const char *INPUT_FILE_NAME = "D:/INRIA/CGAL/workingcopy/Mesh_3/examples/Mesh_3/data/elephant.off";
+  CGAL::Bbox_3 g_bbox(-0.36, 0.36, -0.5, 0.5, -0.3, 0.3);
+  
+  // Fandisk.off => BBox (x,y,z): [ -0.4603, 0.4603 ], [ -0.254894, 0.25555 ], [ -0.499801, 0.499177 ], 
+  //const char *INPUT_FILE_NAME = "D:/INRIA/CGAL/workingcopy/Mesh_3/examples/Mesh_3/data/fandisk.off";
+  //CGAL::Bbox_3 g_bbox(-0.47, 0.47, -0.26, 0.26, -0.5, 0.5);
+  
+# ifdef CGAL_MESH_3_WORKSHARING_USES_TASKS
+#   include <CGAL/Mesh_3/Worksharing_data_structures.h> // CJTODO TEMP?
+    CGAL::Mesh_3::Worksharing_ds_type g_worksharing_ds;
+# endif
+
 # ifdef CGAL_MESH_3_LOCKING_STRATEGY_SIMPLE_GRID_LOCKING
-  CGAL::Mesh_3::Refinement_grid_type g_lock_grid(g_bbox, MESH_3_LOCKING_GRID_NUM_CELLS_PER_AXIS);
+    CGAL::Mesh_3::Refinement_grid_type g_lock_grid(g_bbox, MESH_3_LOCKING_GRID_NUM_CELLS_PER_AXIS);

 # elif defined(CGAL_MESH_3_LOCKING_STRATEGY_CELL_LOCK)
-# include <utility>
-# include <vector>
-# include <tbb/enumerable_thread_specific.h>
-  tbb::enumerable_thread_specific<std::vector<std::pair<void*, unsigned int> > > g_tls_locked_cells;
+#   include <utility>
+#   include <vector>
+#   include <tbb/enumerable_thread_specific.h>
+    tbb::enumerable_thread_specific<std::vector<std::pair<void*, unsigned int> > > g_tls_locked_cells;
 # endif

 #endif
--- a/Mesh_3/demo/Mesh_3/Mesh_function.h
+++ b/Mesh_3/demo/Mesh_3/Mesh_function.h
@ -37,6 +37,11 @@
 #include "C3t3_type.h"
 #include "Meshing_thread.h"

+// CJTODO TEMP: not thread-safe => move it to Mesher_3
+#include <CGAL/Mesh_3/Locking_data_structures.h> // CJTODO TEMP?
+#ifdef CGAL_MESH_3_LOCKING_STRATEGY_SIMPLE_GRID_LOCKING
+  extern CGAL::Mesh_3::Refinement_grid_type g_lock_grid;
+#endif

 struct Mesh_parameters
 {
@ -158,6 +163,9 @@ launch()
       ++it )
  {
    Vertex_handle v = c3t3_.triangulation().insert(it->first);
+# ifdef CGAL_MESH_3_LOCKING_STRATEGY_SIMPLE_GRID_LOCKING
+    g_lock_grid.unlock_all_tls_locked_cells();
+#endif
    c3t3_.set_dimension(v,2); // by construction, points are on surface
    c3t3_.set_index(v,it->second);
  }
--- a/Mesh_3/demo/Mesh_3/Scene_c3t3_item.cpp
+++ b/Mesh_3/demo/Mesh_3/Scene_c3t3_item.cpp
@ -20,19 +20,30 @@
 #include <QGLViewer/qglviewer.h>

 #ifdef CONCURRENT_MESH_3
-  #include <CGAL/Mesh_3/Locking_data_structures.h> // CJODO TEMP?
-  #include <CGAL/BBox_3.h>
+# include <CGAL/Mesh_3/Locking_data_structures.h> // CJTODO TEMP?
+
+# include <CGAL/BBox_3.h>

  // CJTODO TEMP TEST
-#ifdef CGAL_MESH_3_DO_NOT_LOCK_INFINITE_VERTEX
+# ifdef CGAL_MESH_3_DO_NOT_LOCK_INFINITE_VERTEX
  bool g_is_set_cell_active = true;
-#endif
+# endif

  //Global_mutex_type g_global_mutex; // CJTODO: temporary
  
-  // CJTODO TEMP: not thread-safe => move it to Mesher_3
  // Elephant.off => BBox (x,y,z): [ -0.358688, 0.356308 ], [ -0.498433, 0.49535 ], [ -0.298931, 0.298456 ]
-  CGAL::Bbox_3 g_bbox(-0.35, 0.35, -0.5, 0.5, -0.3, 0.3);
+  const char *INPUT_FILE_NAME = "D:/INRIA/CGAL/workingcopy/Mesh_3/examples/Mesh_3/data/elephant.off";
+  CGAL::Bbox_3 g_bbox(-0.36, 0.36, -0.5, 0.5, -0.3, 0.3);
+  
+  // Fandisk.off => BBox (x,y,z): [ -0.4603, 0.4603 ], [ -0.254894, 0.25555 ], [ -0.499801, 0.499177 ], 
+  //const char *INPUT_FILE_NAME = "D:/INRIA/CGAL/workingcopy/Mesh_3/examples/Mesh_3/data/fandisk.off";
+  //CGAL::Bbox_3 g_bbox(-0.47, 0.47, -0.26, 0.26, -0.5, 0.5);
+  
+# ifdef CGAL_MESH_3_WORKSHARING_USES_TASKS
+#   include <CGAL/Mesh_3/Worksharing_data_structures.h> // CJTODO TEMP?
+    CGAL::Mesh_3::Worksharing_ds_type g_worksharing_ds;
+# endif
+
 # ifdef CGAL_MESH_3_LOCKING_STRATEGY_SIMPLE_GRID_LOCKING
  CGAL::Mesh_3::Refinement_grid_type g_lock_grid(g_bbox, MESH_3_LOCKING_GRID_NUM_CELLS_PER_AXIS);

--- a/Mesh_3/demo/Mesh_3/config.h
+++ b/Mesh_3/demo/Mesh_3/config.h
@ -50,10 +50,28 @@
 #   define CGAL_MESH_3_LOCKING_STRATEGY_SIMPLE_GRID_LOCKING
 //#   define CGAL_MESH_3_CONCURRENT_REFINEMENT_LOCK_ADJ_CELLS
 //#   define CGAL_MESH_3_DO_NOT_LOCK_INFINITE_VERTEX
+//#   define CGAL_MESH_3_ACTIVATE_GRID_INDEX_CACHE_IN_VERTEX
+
+#   define CGAL_MESH_3_WORKSHARING_USES_TASKS
+//#   define CGAL_MESH_3_WORKSHARING_USES_PARALLEL_FOR
+//#   define CGAL_MESH_3_WORKSHARING_USES_PARALLEL_DO
+
+#   ifdef CGAL_MESH_3_WORKSHARING_USES_TASKS
+      const int MESH_3_LOCKING_GRID_NUM_CELLS_PER_AXIS = 25;
+      const int MESH_3_FIRST_GRID_LOCK_RADIUS = 0;
+
+      const int MESH_3_WORK_STATS_GRID_NUM_CELLS_PER_AXIS = 2;
+      const int MESH_3_WORK_STATS_GRID_NUM_CELLS = 
+        MESH_3_WORK_STATS_GRID_NUM_CELLS_PER_AXIS*
+        MESH_3_WORK_STATS_GRID_NUM_CELLS_PER_AXIS*
+        MESH_3_WORK_STATS_GRID_NUM_CELLS_PER_AXIS;
+
+#   else
+      const int MESH_3_LOCKING_GRID_NUM_CELLS_PER_AXIS = 30;
+      const int MESH_3_FIRST_GRID_LOCK_RADIUS = 2;
+      const int MESH_3_REFINEMENT_GRAINSIZE = 10;
+#   endif

-    const int MESH_3_LOCKING_GRID_NUM_CELLS_PER_AXIS = 30;
-    const int MESH_3_FIRST_GRID_LOCK_RADIUS = 2;
-    const int MESH_3_REFINEMENT_GRAINSIZE = 10;
    
 #   ifdef CGAL_MESH_3_LOCKING_STRATEGY_CELL_LOCK
 #     include <tbb/recursive_mutex.h>
@ -72,7 +90,7 @@
  // Concurrency Parameters
  // ==========================================================================

-  const size_t ELEMENT_BATCH_SIZE = 30000;
+  const size_t ELEMENT_BATCH_SIZE = 100000;

  // ==========================================================================
  // Profiling
--- a/Mesh_3/include/CGAL/Mesh_3/Locking_data_structures.h
+++ b/Mesh_3/include/CGAL/Mesh_3/Locking_data_structures.h
@ -719,9 +719,9 @@ protected:
  TLS_locked_cells                                m_tls_locked_cells;
 };

-typedef Simple_grid_locking_ds Refinement_grid_type;
+//typedef Simple_grid_locking_ds Refinement_grid_type;
 //typedef Simple_grid_locking_ds_with_mutex Refinement_grid_type;
-//typedef Simple_grid_locking_ds_with_thread_ids Refinement_grid_type;
+typedef Simple_grid_locking_ds_with_thread_ids Refinement_grid_type;


 } //namespace Mesh_3
--- a/Mesh_3/include/CGAL/Mesh_3/Worksharing_data_structures.h
+++ b/Mesh_3/include/CGAL/Mesh_3/Worksharing_data_structures.h
@ -0,0 +1,322 @@
+// Copyright (c) 2012  INRIA Sophia-Antipolis (France).
+// All rights reserved.
+//
+// This file is part of CGAL (www.cgal.org).
+// You can redistribute it and/or modify it under the terms of the GNU
+// General Public License as published by the Free Software Foundation,
+// either version 3 of the License, or (at your option) any later version.
+//
+// Licensees holding a valid commercial license may use this file in
+// accordance with the commercial license agreement provided with the software.
+//
+// This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE
+// WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// $URL: $
+// $Id: $
+//
+// Author(s)     : Clement Jamin
+
+#ifdef CONCURRENT_MESH_3
+
+#ifndef CGAL_MESH_3_WORKSHARING_DATA_STRUCTURES_H
+#define CGAL_MESH_3_WORKSHARING_DATA_STRUCTURES_H
+
+#include <CGAL/Bbox_3.h>
+
+#include <tbb/concurrent_queue.h>
+#include <tbb/task.h>
+
+// CJTODO TEMP: not thread-safe => move it to Mesher_3
+extern CGAL::Bbox_3 g_bbox;
+
+namespace CGAL {
+namespace Mesh_3 {
+
+// Forward declarations
+class Dynamic_load_based_worksharing_ds;
+// Typedef
+typedef Dynamic_load_based_worksharing_ds Worksharing_ds_type;
+
+
+
+class Work_statistics
+{
+public:
+  // Constructors
+  
+  Work_statistics(const Bbox_3 &bbox, 
+                  int num_grid_cells_per_axis)
+    : m_num_grid_cells_per_axis(num_grid_cells_per_axis)
+  {
+    m_laziest_cell_index = 0;
+    m_laziest_cell_occupation = 1000;
+
+    int num_cells =
+      num_grid_cells_per_axis*num_grid_cells_per_axis*num_grid_cells_per_axis;
+    m_occupation_grid = new tbb::atomic<int>[num_cells];
+    // Initialize grid
+    for (int i = 0 ; i < num_cells ; ++i)
+      m_occupation_grid[i] = 0;
+
+    // Keep mins and resolutions
+    m_xmin = bbox.xmin();
+    m_ymin = bbox.ymin();
+    m_zmin = bbox.zmin();
+    double n = static_cast<double>(num_grid_cells_per_axis);
+    m_resolution_x = n / (bbox.xmax() - m_xmin);
+    m_resolution_y = n / (bbox.ymax() - m_ymin);
+    m_resolution_z = n / (bbox.zmax() - m_zmin);
+  }
+
+  /// Destructor
+  ~Work_statistics()
+  {
+    delete [] m_occupation_grid;
+  }
+
+  void add_occupation(int cell_index, int to_add, int num_items_in_work_queue)
+  {
+    int new_occupation = 
+      (m_occupation_grid[cell_index].fetch_and_add(to_add)) 
+      + to_add;
+
+    // If this cell is currently the laziest one, update its recorded occupation
+    if (cell_index == m_laziest_cell_index)
+    {
+      if (num_items_in_work_queue == 0)
+        // So that it won't remain the laziest for long
+        m_laziest_cell_occupation = 999999;
+      else
+        m_laziest_cell_occupation = new_occupation;
+    }
+    else if (num_items_in_work_queue > 0 
+      && new_occupation <= m_laziest_cell_occupation)
+    {
+      m_laziest_cell_index = cell_index;
+      m_laziest_cell_occupation = new_occupation;
+    }
+  }
+  
+  void add_occupation(int index_x, int index_y, int index_z, 
+                      int to_add, int num_items_in_work_queue)
+  {
+    int index = 
+      index_z*m_num_grid_cells_per_axis*m_num_grid_cells_per_axis
+      + index_y*m_num_grid_cells_per_axis 
+      + index_x;
+    return add_occupation(index, to_add, num_items_in_work_queue);
+  }
+  
+  /// P3 must provide .x(), .y(), .z()
+  template <typename P3>
+  int compute_index(const P3 &point)
+  {
+    // Compute indices on grid
+    int index_x = static_cast<int>( (to_double(point.x()) - m_xmin) * m_resolution_x);
+    index_x = std::max( 0, std::min(index_x, m_num_grid_cells_per_axis - 1) );
+    int index_y = static_cast<int>( (to_double(point.y()) - m_ymin) * m_resolution_y);
+    index_y = std::max( 0, std::min(index_y, m_num_grid_cells_per_axis - 1) );
+    int index_z = static_cast<int>( (to_double(point.z()) - m_zmin) * m_resolution_z);
+    index_z = std::max( 0, std::min(index_z, m_num_grid_cells_per_axis - 1) );
+    
+    int index = 
+      index_z*m_num_grid_cells_per_axis*m_num_grid_cells_per_axis
+      + index_y*m_num_grid_cells_per_axis 
+      + index_x;
+
+    return index;
+  }
+
+  /// P3 must provide .x(), .y(), .z()
+  // Returns index in grid
+  template <typename P3>
+  int add_occupation(const P3 &point, int to_add, int num_items_in_work_queue)
+  {
+    int index = compute_index(point);
+    add_occupation(index, to_add, num_items_in_work_queue);
+    return index;
+  }
+
+  int get_laziest_cell_index()
+  {
+    return m_laziest_cell_index;
+  }
+  
+protected:
+  int                                             m_num_grid_cells_per_axis;
+  double                                          m_xmin;
+  double                                          m_ymin;
+  double                                          m_zmin;
+  double                                          m_resolution_x;
+  double                                          m_resolution_y;
+  double                                          m_resolution_z;
+  tbb::atomic<int> *                              m_occupation_grid;
+
+  tbb::atomic<int>                                m_laziest_cell_index;
+  tbb::atomic<int>                                m_laziest_cell_occupation;
+};
+
+
+/* 
+ * ==============
+ * class WorkItem
+ * Abstract base class for a piece of work.
+ * ==============
+ */
+class WorkItem 
+{
+public:
+  WorkItem() {}
+  // Derived class defines the actual work.
+  virtual void run() = 0;
+  virtual void set_index(int) = 0;
+  virtual int get_index() const = 0;
+};
+
+template<typename Func>
+class ConcreteWorkItem
+  : public WorkItem
+{
+public:
+  ConcreteWorkItem(const Func& func)
+    : m_func(func), m_index(-1)
+  {}
+  
+  void run() 
+  {
+    m_func();
+    delete this;
+  }
+  
+  void set_index(int index)
+  {
+    m_index = index;
+  }
+
+  int get_index() const
+  {
+    return m_index;
+  }
+
+private:
+  Func  m_func;
+  int   m_index;
+};
+
+
+
+/* 
+ * =================
+ * class RunWorkItem
+ * =================
+ */
+class RunWorkItem
+  : public tbb::task 
+{
+public:
+  RunWorkItem() {}
+
+private:
+  /*override*/inline tbb::task* execute();
+};
+
+
+
+/* 
+ * =======================================
+ * class Dynamic_load_based_worksharing_ds
+ * =======================================
+ */
+class Dynamic_load_based_worksharing_ds
+{
+public:
+  // Constructors
+  Dynamic_load_based_worksharing_ds()
+    : m_stats(g_bbox, MESH_3_WORK_STATS_GRID_NUM_CELLS_PER_AXIS)
+  {
+    for (int i = 0 ; i < MESH_3_WORK_STATS_GRID_NUM_CELLS ; ++i)
+      m_num_items[i] = 0;
+  }
+
+  /// Destructor
+  ~Dynamic_load_based_worksharing_ds()
+  {
+  }
+
+  template <typename P3>
+  void add(WorkItem * p_item, const P3 &point, tbb::task &parent_task)
+  {
+    int index = m_stats.compute_index(point);
+    p_item->set_index(index);
+    m_work_items[index].push(p_item);
+    ++m_num_items[index];
+    // CJTODO: try "spawn" instead of enqueue
+    tbb::task::enqueue(*new(parent_task.allocate_child()) RunWorkItem);
+  }
+
+  void run_next_work_item()
+  {
+    WorkItem *p_item = 0;
+    int index = m_stats.get_laziest_cell_index();
+    bool popped = m_work_items[index].try_pop(p_item);
+    // If queue is empty
+    if (!popped)
+    {
+      // Look for a non-empty queue
+      for (index = 0 ; !popped ; ++index)
+      {
+        CGAL_assertion(index < MESH_3_WORK_STATS_GRID_NUM_CELLS);
+        popped = m_work_items[index].try_pop(p_item);
+      }
+
+      --index;
+    }
+    --m_num_items[index];
+    CGAL_assertion(p_item != 0);
+    m_stats.add_occupation(index, 1, m_num_items[index]);
+    p_item->run();
+    m_stats.add_occupation(index, -1, m_num_items[index]);
+  }
+
+protected:
+  Work_statistics                   m_stats; 
+  tbb::concurrent_queue<WorkItem*>  m_work_items[MESH_3_WORK_STATS_GRID_NUM_CELLS];
+  tbb::atomic<int>                  m_num_items [MESH_3_WORK_STATS_GRID_NUM_CELLS];
+};
+
+
+} //namespace Mesh_3
+} //namespace CGAL
+
+extern CGAL::Mesh_3::Worksharing_ds_type g_worksharing_ds;
+
+namespace CGAL
+{
+namespace Mesh_3
+{
+
+inline tbb::task* RunWorkItem::execute()
+{
+  g_worksharing_ds.run_next_work_item();
+  return NULL;
+}
+
+/* 
+ * =====================
+ * function enqueue_work
+ * =====================
+ */
+template<typename Func, typename P3>
+void enqueue_work(Func f, tbb::task &parent_task, const P3 &point)
+{
+  g_worksharing_ds.add(new ConcreteWorkItem<Func>(f), 
+                       point,
+                       parent_task);
+}
+
+} //namespace Mesh_3
+} //namespace CGAL
+
+#endif // CGAL_MESH_3_WORKSHARING_DATA_STRUCTURES_H
+#endif // CONCURRENT_MESH_3
--- a/STL_Extension/include/CGAL/Concurrent_compact_container.h
+++ b/STL_Extension/include/CGAL/Concurrent_compact_container.h
@ -562,7 +562,8 @@ private:
  }
  
  typedef tbb::enumerable_thread_specific<FreeList> Free_lists;
-  typedef tbb::spin_mutex                           Mutex; // CJTODO: try others
+  typedef tbb::queuing_mutex                        Mutex; // CJTODO: try others
+  //typedef tbb::spin_mutex                           Mutex; // CJTODO: try others

  // We store a vector of pointers to all allocated blocks and their sizes.
  // Knowing all pointers, we don't have to walk to the end of a block to reach
--- a/Triangulation_3/include/CGAL/Triangulation_3.h
+++ b/Triangulation_3/include/CGAL/Triangulation_3.h
@ -523,6 +523,31 @@ public:

 #ifdef CGAL_MESH_3_CONCURRENT_REFINEMENT
  
+  // Tries to lock the grid area around vertex vh; returns true on success
+  bool try_lock_vertex(Vertex_handle vh, int lock_radius = 0) const
+  {
+#ifdef CGAL_MESH_3_ACTIVATE_GRID_INDEX_CACHE_IN_VERTEX
+    const int cached_index = vh->get_grid_index_cache();
+    if (cached_index < 0)
+    {
+      // Nothing cached yet: lock from the point and store the returned index
+      std::pair<bool, int> lock_result =
+        g_lock_grid.try_lock(vh->point(), lock_radius);
+      vh->set_grid_index_cache(lock_result.second);
+      return lock_result.first;
+    }
+
+    // Lock through the cached index
+    if (!g_lock_grid.try_lock(cached_index, lock_radius))
+      return false;
+
+    // Success only if the cached value has not changed in the meantime
+    return vh->get_grid_index_cache() == cached_index;
+#else
+    return g_lock_grid.try_lock(vh->point(), lock_radius).first;
+#endif
+  }
+
  bool try_lock_element(Cell_handle cell_handle, int lock_radius = 0) const
  {
    bool success = true;
@ -537,10 +562,10 @@ public:
      // We do not lock the infinite vertex
      if (!is_infinite(vh))
      {
-        success = g_lock_grid.try_lock(vh->point(), lock_radius).first;
+        success = try_lock_vertex(vh, lock_radius);
      }
 #   else
-      success = g_lock_grid.try_lock(vh->point(), lock_radius).first;
+      success = try_lock_vertex(vh, lock_radius);
 #   endif
    }
 # elif defined(CGAL_MESH_3_LOCKING_STRATEGY_CELL_LOCK)
@ -556,10 +581,11 @@ public:
 # ifdef CGAL_MESH_3_LOCKING_STRATEGY_SIMPLE_GRID_LOCKING
    // Lock the element area on the grid
    Cell_handle cell = facet.first;
-    for (int iVertex = (facet.second+1)&3 ; success && iVertex != facet.second ; iVertex = (iVertex+1)&3)
+    for (int iVertex = (facet.second+1)&3 ; 
+         success && iVertex != facet.second ; iVertex = (iVertex+1)&3)
    {
      Vertex_handle vh = cell->vertex(iVertex);
-      success = g_lock_grid.try_lock(vh->point(), lock_radius).first;
+      success = try_lock_vertex(vh, lock_radius);
    }
 # elif defined(CGAL_MESH_3_LOCKING_STRATEGY_CELL_LOCK)
    success = facet.first->try_lock(); // CJTODO: we lock the cell => stupid?
--- a/Triangulation_3/include/CGAL/Triangulation_ds_vertex_base_3.h
+++ b/Triangulation_3/include/CGAL/Triangulation_ds_vertex_base_3.h
@ -32,6 +32,10 @@
 # endif
 #endif

+#ifdef CGAL_MESH_3_ACTIVATE_GRID_INDEX_CACHE_IN_VERTEX
+# include <tbb/atomic.h>
+#endif
+
 namespace CGAL {

 template < typename TDS = void >
@ -46,13 +50,16 @@ public:
  struct Rebind_TDS { typedef Triangulation_ds_vertex_base_3<TDS2> Other; };

  
+  Triangulation_ds_vertex_base_3()
+    : _c()
 #ifdef CGAL_MESH_3_DO_NOT_LOCK_INFINITE_VERTEX
-  Triangulation_ds_vertex_base_3()
-    : _c(), m_visited(false) {}
-#else
-  Triangulation_ds_vertex_base_3()
-    : _c() {}
+    , m_visited(false) 
 #endif
+  {
+#ifdef CGAL_MESH_3_ACTIVATE_GRID_INDEX_CACHE_IN_VERTEX
+    m_grid_index_cache = -1; // Negative value: no grid index cached yet
+#endif
+  }

  Triangulation_ds_vertex_base_3(Cell_handle c)
    : _c(c) {}
@ -91,6 +98,17 @@ public:
    return cell() != Cell_handle();
  }
  
+#ifdef CGAL_MESH_3_ACTIVATE_GRID_INDEX_CACHE_IN_VERTEX
+  // Stores `index` as the cached grid index of this vertex
+  // (the default constructor initializes the cache to -1)
+  void set_grid_index_cache (int index)
+  { m_grid_index_cache = index; }
+  // Returns the cached grid index of this vertex.
+  // Declared const: it only reads the atomic, so const vertices can be queried
+  int get_grid_index_cache() const
+  { return m_grid_index_cache; }
+#endif
+
  // For use by the Compact_container.
  void *   for_compact_container() const
  { return _c.for_compact_container(); }
@ -101,6 +119,9 @@ private:

 #ifdef CGAL_MESH_3_DO_NOT_LOCK_INFINITE_VERTEX
  mutable tbb::spin_mutex m_mutex;
+#endif
+#ifdef CGAL_MESH_3_ACTIVATE_GRID_INDEX_CACHE_IN_VERTEX
+  tbb::atomic<int> m_grid_index_cache;
 #endif
  Cell_handle _c;