Octopus
kmeans_clustering.F90
Go to the documentation of this file.
1!! Copyright (C) 2024. A Buccheri.
2!!
3!! This program is free software; you can redistribute it and/or modify
4!! it under the terms of the GNU General Public License as published by
5!! the Free Software Foundation; either version 2, or (at your option)
6!! any later version.
7!!
8!! This program is distributed in the hope that it will be useful,
9!! but WITHOUT ANY WARRANTY; without even the implied warranty of
10!! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11!! GNU General Public License for more details.
12!!
13!! You should have received a copy of the GNU General Public License
14!! along with this program; if not, write to the Free Software
15!! Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16!! 02110-1301, USA.
17
18#include "global.h"
19
21 use, intrinsic :: iso_fortran_env
22 use debug_oct_m
23 use global_oct_m
24 use mesh_oct_m
26 use mpi_oct_m
30 use space_oct_m
31 use sort_oct_m
33 implicit none
34 private
35
40
41contains
42
43 ! TODO(Alex) Issue #1004. Implement `assign_points_to_centroids` for periodic boundary conditions
44
74 subroutine assign_points_to_centroids_finite_bc(mesh, centroids, ip_to_ic)
75 class(mesh_t), intent(in) :: mesh
76 real(real64), intent(in) :: centroids(:, :)
77 integer, intent(out) :: ip_to_ic(:)
78
79 integer :: ip, ic, icen
80 integer :: n_centroids
81 real(real64) :: min_dist, dist
82 real(real64), allocatable :: point(:)
83 real(real64), parameter :: tol = 1.0e-13_real64
84 ! required to distinguish degenerate points, else `if (dist < min_dist)` can vary on
85 ! different hardware due to numerical noise. One could equally choose tol to be some
86 ! percentage of the grid spacing.
87
89
90 ! Grid to centroid index map should have size of the grid
91 assert(size(ip_to_ic) == mesh%np)
92
93 n_centroids = size(centroids, 2)
94 safe_allocate(point(1:size(centroids, 1)))
95
96 !$omp parallel do default(shared) private(point, icen, min_dist, dist)
97 do ip = 1, mesh%np
98 ip_to_ic(ip) = 0
99 ! Compute which centroid, grid point `ip` is closest to
100 point = mesh%x(:, ip)
101 icen = 1
102 min_dist = sum((centroids(:, 1) - point(:))**2)
103 do ic = 2, n_centroids
104 dist = sum((centroids(:, ic) - point(:))**2)
105 if (dist < min_dist - tol) then
106 min_dist = dist
107 icen = ic
108 endif
109 enddo
110 ip_to_ic(ip) = icen
111 enddo
112 !$omp end parallel do
113
114 safe_deallocate_a(point)
117
119
120
133 subroutine update_centroids(mesh, weight, ip_to_ic, centroids)
134 class(mesh_t), intent(in) :: mesh
135 real(real64), intent(in) :: weight(:)
136 integer, intent(in) :: ip_to_ic(:)
137 real(real64), contiguous, intent(inout) :: centroids(:, :)
138 ! Out: Updated centroid positions
139
140 integer :: n_centroids, ic, ip
141 real(real64) :: one_over_denom
142 real(real64), allocatable :: denominator(:)
143
144 push_sub(update_centroids)
145
146 ! The indexing of weight and grid must be consistent => must belong to the same spatial distribution
147 ! This does not explicit assert this, but if the grid sizes differ, this is a clear indicator of a problem
148 assert(mesh%np == size(weight))
149
150 n_centroids = size(centroids, 2)
151 safe_allocate(denominator(1:n_centroids))
152
153 do ic = 1, n_centroids
154 centroids(:, ic) = 0._real64
155 denominator(ic) = 0._real64
156 enddo
157
158 !$omp parallel do private(ic) reduction(+ : centroids, denominator)
159 do ip = 1, mesh%np
160 ic = ip_to_ic(ip)
161 ! Initially accumulate the numerator in `centroids`
162 centroids(:, ic) = centroids(:, ic) + (mesh%x(:, ip) * weight(ip))
163 denominator(ic) = denominator(ic) + weight(ip)
164 enddo
165 !$omp end parallel do
166
167 ! Gather contributions to numerator and denominator of all centroids, from all domains of the mesh/grid
168 call mesh%allreduce(centroids)
169 call mesh%allreduce(denominator)
170
171 ! If division by zero occurs here it implies that the sum(weight) = 0 for all grid points
172 ! in cluster ic. This can occur if the initial centroid is poorly chosen at a point with no
173 !! associated weight (such as the vacuum of a crystal cell)
174 !$omp parallel do private(one_over_denom) reduction(* : centroids)
175 do ic = 1, n_centroids
176 one_over_denom = 1._real64 / denominator(ic)
177 centroids(:, ic) = centroids(:, ic) * one_over_denom
178 enddo
179 !$omp end parallel do
180
181 safe_deallocate_a(denominator)
182 pop_sub(update_centroids)
183
184 end subroutine update_centroids
185
186
191 subroutine compute_grid_difference(points, updated_points, tol, points_differ)
192 real(real64), intent(in) :: points(:, :)
193 real(real64), intent(in) :: updated_points(:, :)
194 real(real64), intent(in) :: tol
195 logical, intent(out) :: points_differ(:)
196
197 integer :: ip, n_dim
198 real(real64), allocatable :: diff(:)
199
201
202 n_dim = size(points, 1)
203 allocate(diff(n_dim))
204
205 !$omp parallel do default(shared) private(diff)
206 do ip = 1, size(points, 2)
207 diff(:) = abs(updated_points(:, ip) - points(:, ip))
208 points_differ(ip) = any(diff > tol)
209 enddo
210 !$omp end parallel do
211
212 if(debug%info) then
213 call report_differences_in_grids(points, updated_points, tol, points_differ)
214 endif
215
217
218 end subroutine compute_grid_difference
219
220
222 subroutine report_differences_in_grids(points, updated_points, tol, points_differ)
223 real(real64), intent(in) :: points(:, :)
224 real(real64), intent(in) :: updated_points(:, :)
225 real(real64), intent(in) :: tol
226 logical, intent(in) :: points_differ(:)
227
228 integer, allocatable :: indices(:)
229 integer :: i, j, n_unconverged, ndim
230 character(len=50) :: f_string
231 real(real64), allocatable :: diff(:)
232
234
235 indices = pack([(i, i=1,size(points_differ))], points_differ)
236 n_unconverged = size(indices)
237 ndim = size(points, 1)
238 allocate(diff(ndim))
239
240 write(f_string, '(A, I1, A, I1, A, I1, A)') '(', &
241 & ndim, '(F16.10, X), ', &
242 & ndim, '(F16.10, X), ', &
243 & ndim, '(F16.10, X), F16.10)'
244
245 write(message(1), '(a)') "# Current Point , Prior Point , |ri - r_{i-1}| , tol"
246 call messages_info(1)
247 do j = 1, n_unconverged
248 i = indices(j)
249 diff(:) = abs(updated_points(:, i) - points(:, i))
250 write(message(1), f_string) updated_points(:, i), points(:, i), diff, tol
251 call messages_info(1)
252 enddo
253 write(message(1), *) "Summary:", n_unconverged, "of out", size(points, 2), "are not converged"
254 call messages_info(1)
255
257
258 end subroutine report_differences_in_grids
259
260
290 subroutine weighted_kmeans(space, mesh, weight, centroids, n_iter, centroid_tol, discretize, inertia)
291 class(space_t), intent(in) :: space
292 class(mesh_t), intent(in) :: mesh
293 real(real64), intent(in) :: weight(:)
294 real(real64), contiguous, intent(inout) :: centroids(:, :)
295 ! Out: Final centroids
296 integer, optional, intent(in ) :: n_iter
297 real(real64), optional, intent(in ) :: centroid_tol
298 logical, optional, intent(in ) :: discretize
299 real(real64), optional, intent(out) :: inertia
300
301 logical :: discretize_centroids
302 integer :: n_iterations, n_centroid, i
303 real(real64) :: tol
304 integer, allocatable :: ip_to_ic(:)
305 real(real64), allocatable :: prior_centroids(:, :)
306 logical, allocatable :: points_differ(:)
307
308 push_sub(weighted_kmeans)
309
310 n_iterations = optional_default(n_iter, 200)
311 tol = optional_default(centroid_tol, 1.e-4_real64)
312 discretize_centroids = optional_default(discretize, .true.)
313
314 ! Should use a positive number of iterations
315 assert(n_iterations >= 1)
316 ! Number of weights inconsistent with number of grid points
317 assert(size(weight) == mesh%np)
318 ! Spatial dimensions of centroids array is inconsistent
319 assert(size(centroids, 1) == space%dim)
320 ! Assignment of points to centroids only implemented for finite BCs
321 assert(.not. space%is_periodic())
322
323 ! Work arrays
324 n_centroid = size(centroids, 2)
325 safe_allocate_source(prior_centroids(space%dim, size(centroids, 2)), centroids)
326 safe_allocate(ip_to_ic(1:mesh%np))
327 safe_allocate(points_differ(1:n_centroid))
328
329 write(message(1), '(a)') 'Debug: Performing weighted Kmeans clustering '
330 call messages_info(1, debug_only=.true.)
331
332 do i = 1, n_iterations
333 write(message(1), '(a, I3)') 'Debug: Iteration ', i
334 call messages_info(1, debug_only=.true.)
335 ! TODO(Alex) Issue #1004. Implement `assign_points_to_centroids` for periodic boundary conditions
336 call assign_points_to_centroids_finite_bc(mesh, centroids, ip_to_ic)
337
338 call update_centroids(mesh, weight, ip_to_ic, centroids)
339 call compute_grid_difference(prior_centroids, centroids, tol, points_differ)
340
341 if (any(points_differ)) then
342 prior_centroids = centroids
343 else
344 write(message(1), '(a)') 'Debug: All centroid points converged'
345 call messages_info(1, debug_only=.true.)
346 ! Break loop
347 exit
348 endif
349
350 enddo
351
352 if (discretize_centroids) then
353 call mesh_discretize_values_to_mesh(mesh, centroids)
354 endif
355
356 if (present(inertia)) then
357 call compute_centroid_inertia(mesh, centroids, weight, ip_to_ic, inertia)
358 endif
359
360 safe_deallocate_a(prior_centroids)
361 safe_deallocate_a(ip_to_ic)
362 safe_deallocate_a(points_differ)
363
364 pop_sub(weighted_kmeans)
365
366 end subroutine weighted_kmeans
367
368
373 subroutine sample_initial_centroids(mesh, centroids, seed_value)
374 class(mesh_t), intent(in ) :: mesh
375 real(real64), contiguous, intent(out) :: centroids(:, :)
376 integer(int64), intent(inout), optional :: seed_value
377 ! This will get mutated by the Fisher Yates shuffle
378
379 integer(int32) :: n_centroids
380 integer(int64), allocatable :: centroid_idx(:)
381 integer(int64) :: ipg
382 integer(int32) :: ic
383
386 n_centroids = size(centroids, 2)
387 safe_allocate(centroid_idx(1:n_centroids))
388
389 ! Choose n_centroids indices from [1, np_global]
390 call fisher_yates_shuffle(n_centroids, mesh%np_global, seed_value, centroid_idx)
391
392 ! Convert ip_global to (x,y,z)
393 do ic = 1, n_centroids
394 ipg = centroid_idx(ic)
395 centroids(:, ic) = mesh_x_global(mesh, ipg)
396 enddo
397
398 safe_deallocate_a(centroid_idx)
399
401
402 end subroutine sample_initial_centroids
403
404
414 subroutine compute_centroid_inertia(mesh, centroids, weight, ip_to_ic, inertia)
415 class(mesh_t), intent(in) :: mesh
416 real(real64), intent(in) :: centroids(:, :)
417 real(real64), intent(in) :: weight(:)
418 integer, intent(in) :: ip_to_ic(:)
419 real(real64), intent(out) :: inertia
420 integer :: ip, ic
421
422 inertia = 0.0_real64
423
424 !$omp parallel do private(ip) reduction(+ : inertia)
425 do ip = 1, mesh%np
426 ic = ip_to_ic(ip)
427 inertia = inertia + weight(ip) * sum((centroids(:, ic) - mesh%x(:, ip))**2)
428 enddo
429 !$omp end parallel do
430
431 call mesh%allreduce(inertia)
432
433 end subroutine compute_centroid_inertia
434
436
437!! Local Variables:
438!! mode: f90
439!! coding: utf-8
440!! End:
type(debug_t), save, public debug
Definition: debug.F90:158
subroutine report_differences_in_grids(points, updated_points, tol, points_differ)
Report differences returned from compute_grid_difference.
subroutine, public weighted_kmeans(space, mesh, weight, centroids, n_iter, centroid_tol, discretize, inertia)
Weighted K-means clustering.
subroutine, public sample_initial_centroids(mesh, centroids, seed_value)
Sample initial centroids from the full mesh.
subroutine, public assign_points_to_centroids_finite_bc(mesh, centroids, ip_to_ic)
Assign each grid point to the closest centroid. A centroid and its set of nearest grid points defines...
subroutine, public update_centroids(mesh, weight, ip_to_ic, centroids)
Compute a new set of centroids.
subroutine compute_centroid_inertia(mesh, centroids, weight, ip_to_ic, inertia)
Compute the inertia of all centroids.
subroutine compute_grid_difference(points, updated_points, tol, points_differ)
Compute the difference in two grids as .
This module defines the meshes, which are used in Octopus.
Definition: mesh.F90:120
subroutine, public mesh_discretize_values_to_mesh(mesh, values)
Assign a set of values to their nearest discrete points on the mesh.
Definition: mesh.F90:416
real(real64) function, dimension(1:mesh%box%dim), public mesh_x_global(mesh, ipg)
Given a global point index, this function returns the coordinates of the point.
Definition: mesh.F90:817
character(len=256), dimension(max_lines), public message
to be output by fatal, warning
Definition: messages.F90:162
subroutine, public messages_info(no_lines, iunit, debug_only, stress, all_nodes, namespace)
Definition: messages.F90:594
subroutine, public fisher_yates_shuffle(m, n, seed, values)
Return m random numbers from a range of with no replacement.
Definition: quickrnd.F90:309
This module is intended to contain "only mathematical" functions and procedures.
Definition: sort.F90:119
Describes mesh distribution to nodes.
Definition: mesh.F90:187
int true(void)