Octopus
kmeans_clustering.F90
Go to the documentation of this file.
1!! Copyright (C) 2024. A Buccheri.
2!!
3!! This program is free software; you can redistribute it and/or modify
4!! it under the terms of the GNU General Public License as published by
5!! the Free Software Foundation; either version 2, or (at your option)
6!! any later version.
7!!
8!! This program is distributed in the hope that it will be useful,
9!! but WITHOUT ANY WARRANTY; without even the implied warranty of
10!! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11!! GNU General Public License for more details.
12!!
13!! You should have received a copy of the GNU General Public License
14!! along with this program; if not, write to the Free Software
15!! Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16!! 02110-1301, USA.
17
18#include "global.h"
19
21 use, intrinsic :: iso_fortran_env
22 use debug_oct_m
24 use global_oct_m
25 use mesh_oct_m
27 use mpi_oct_m
32 use space_oct_m
33 use sort_oct_m
35 implicit none
36 private
37
42
43contains
44
45 ! TODO(Alex) Issue #1004. Implement `assign_points_to_centroids` for periodic boundary conditions
46
76 subroutine assign_points_to_centroids_finite_bc(mesh, centroids, ip_to_ic)
77 class(mesh_t), intent(in) :: mesh
78 real(real64), intent(in) :: centroids(:, :)
79 integer, intent(out) :: ip_to_ic(:)
80
81 integer :: ip, ic, icen
82 integer :: n_centroids
83 real(real64) :: min_dist, dist
84 real(real64), allocatable :: point(:)
85 real(real64), parameter :: tol = 1.0e-13_real64
86 ! required to distinguish degenerate points, else `if (dist < min_dist)` can vary on
87 ! different hardware due to numerical noise. One could equally choose tol to be some
88 ! percentage of the grid spacing.
89
91
92 ! Grid to centroid index map should have size of the grid
93 assert(size(ip_to_ic) == mesh%np)
94
95 n_centroids = size(centroids, 2)
96 safe_allocate(point(1:size(centroids, 1)))
97
98 !$omp parallel do default(shared) private(point, icen, min_dist, dist)
99 do ip = 1, mesh%np
100 ip_to_ic(ip) = 0
101 ! Compute which centroid, grid point `ip` is closest to
102 point = mesh%x(ip, :)
103 icen = 1
104 min_dist = sum((centroids(:, 1) - point(:))**2)
105 do ic = 2, n_centroids
106 dist = sum((centroids(:, ic) - point(:))**2)
107 if (dist < min_dist - tol) then
108 min_dist = dist
109 icen = ic
110 endif
111 enddo
112 ip_to_ic(ip) = icen
113 enddo
114 !$omp end parallel do
115
116 safe_deallocate_a(point)
117
119
121
122
135 subroutine update_centroids(mesh, weight, ip_to_ic, centroids)
136 class(mesh_t), intent(in) :: mesh
137 real(real64), intent(in) :: weight(:)
138 integer, intent(in) :: ip_to_ic(:)
139 real(real64), contiguous, intent(inout) :: centroids(:, :)
140 ! Out: Updated centroid positions
141
142 integer :: n_centroids, ic, ip
143 real(real64) :: one_over_denom
144 real(real64), allocatable :: denominator(:)
145
146 push_sub(update_centroids)
147
148 ! The indexing of weight and grid must be consistent => must belong to the same spatial distribution
149 ! This does not explicit assert this, but if the grid sizes differ, this is a clear indicator of a problem
150 assert(mesh%np == size(weight))
151
152 n_centroids = size(centroids, 2)
153 safe_allocate(denominator(1:n_centroids))
154
155 do ic = 1, n_centroids
156 centroids(:, ic) = 0._real64
157 denominator(ic) = 0._real64
158 enddo
159
160 !$omp parallel do private(ic) reduction(+ : centroids, denominator)
161 do ip = 1, mesh%np
162 ic = ip_to_ic(ip)
163 ! Initially accumulate the numerator in `centroids`
164 centroids(:, ic) = centroids(:, ic) + (mesh%x(ip, :) * weight(ip))
165 denominator(ic) = denominator(ic) + weight(ip)
166 enddo
167 !$omp end parallel do
168
169 ! Gather contributions to numerator and denominator of all centroids, from all domains of the mesh/grid
170 call mesh%allreduce(centroids)
171 call mesh%allreduce(denominator)
172
173 ! If division by zero occurs here it implies that the sum(weight) = 0 for all grid points
174 ! in cluster ic. This can occur if the initial centroid is poorly chosen at a point with no
175 !! associated weight (such as the vacuum of a crystal cell)
176 !$omp parallel do private(one_over_denom) reduction(* : centroids)
177 do ic = 1, n_centroids
178 one_over_denom = 1._real64 / denominator(ic)
179 centroids(:, ic) = centroids(:, ic) * one_over_denom
180 enddo
181 !$omp end parallel do
182
183 safe_deallocate_a(denominator)
184 pop_sub(update_centroids)
185
186 end subroutine update_centroids
187
188
193 subroutine compute_grid_difference(points, updated_points, tol, points_differ)
194 real(real64), intent(in) :: points(:, :)
195 real(real64), intent(in) :: updated_points(:, :)
196 real(real64), intent(in) :: tol
197 logical, intent(out) :: points_differ(:)
198
199 integer :: ip, n_dim
200 real(real64), allocatable :: diff(:)
201
203
204 n_dim = size(points, 1)
205 allocate(diff(n_dim))
206
207 !$omp parallel do default(shared) private(diff)
208 do ip = 1, size(points, 2)
209 diff(:) = abs(updated_points(:, ip) - points(:, ip))
210 points_differ(ip) = any(diff > tol)
211 enddo
212 !$omp end parallel do
213
214 if(debug%info) then
215 call report_differences_in_grids(points, updated_points, tol, points_differ)
216 endif
217
219
220 end subroutine compute_grid_difference
221
222
224 subroutine report_differences_in_grids(points, updated_points, tol, points_differ)
225 real(real64), intent(in) :: points(:, :)
226 real(real64), intent(in) :: updated_points(:, :)
227 real(real64), intent(in) :: tol
228 logical, intent(in) :: points_differ(:)
229
230 integer, allocatable :: indices(:)
231 integer :: i, j, n_unconverged, ndim
232 character(len=50) :: f_string
233 real(real64), allocatable :: diff(:)
234
236
237 indices = pack([(i, i=1,size(points_differ))], points_differ)
238 n_unconverged = size(indices)
239 ndim = size(points, 1)
240 allocate(diff(ndim))
241
242 write(f_string, '(A, I1, A, I1, A, I1, A)') '(', &
243 & ndim, '(F16.10, X), ', &
244 & ndim, '(F16.10, X), ', &
245 & ndim, '(F16.10, X), F16.10)'
246
247 write(message(1), '(a)') "# Current Point , Prior Point , |ri - r_{i-1}| , tol"
248 call messages_info(1)
249 do j = 1, n_unconverged
250 i = indices(j)
251 diff(:) = abs(updated_points(:, i) - points(:, i))
252 write(message(1), f_string) updated_points(:, i), points(:, i), diff, tol
253 call messages_info(1)
254 enddo
255 write(message(1), *) "Summary:", n_unconverged, "of out", size(points, 2), "are not converged"
256 call messages_info(1)
257
259
260 end subroutine report_differences_in_grids
261
262
292 subroutine weighted_kmeans(space, mesh, weight, centroids, n_iter, centroid_tol, discretize, inertia)
293 class(space_t), intent(in) :: space
294 class(mesh_t), intent(in) :: mesh
295 real(real64), intent(in) :: weight(:)
296 real(real64), contiguous, intent(inout) :: centroids(:, :)
297 ! Out: Final centroids
298 integer, optional, intent(in ) :: n_iter
299 real(real64), optional, intent(in ) :: centroid_tol
300 logical, optional, intent(in ) :: discretize
301 real(real64), optional, intent(out) :: inertia
302
303 logical :: discretize_centroids
304 integer :: n_iterations, n_centroid, i
305 real(real64) :: tol
306 integer, allocatable :: ip_to_ic(:)
307 real(real64), allocatable :: prior_centroids(:, :)
308 logical, allocatable :: points_differ(:)
309
310 push_sub(weighted_kmeans)
311
312 n_iterations = optional_default(n_iter, 200)
313 tol = optional_default(centroid_tol, 1.e-4_real64)
314 discretize_centroids = optional_default(discretize, .true.)
315
316 ! Should use a positive number of iterations
317 assert(n_iterations >= 1)
318 ! Number of weights inconsistent with number of grid points
319 assert(size(weight) == mesh%np)
320 ! Spatial dimensions of centroids array is inconsistent
321 assert(size(centroids, 1) == space%dim)
322 ! Assignment of points to centroids only implemented for finite BCs
323 assert(.not. space%is_periodic())
324
325 ! Work arrays
326 n_centroid = size(centroids, 2)
327 safe_allocate_source(prior_centroids(space%dim, size(centroids, 2)), centroids)
328 safe_allocate(ip_to_ic(1:mesh%np))
329 safe_allocate(points_differ(1:n_centroid))
330
331 write(message(1), '(a)') 'Debug: Performing weighted Kmeans clustering '
332 call messages_info(1, debug_only=.true.)
333
334 do i = 1, n_iterations
335 write(message(1), '(a, I3)') 'Debug: Iteration ', i
336 call messages_info(1, debug_only=.true.)
337 ! TODO(Alex) Issue #1004. Implement `assign_points_to_centroids` for periodic boundary conditions
338 call assign_points_to_centroids_finite_bc(mesh, centroids, ip_to_ic)
339
340 call update_centroids(mesh, weight, ip_to_ic, centroids)
341 call compute_grid_difference(prior_centroids, centroids, tol, points_differ)
342
343 if (any(points_differ)) then
344 prior_centroids = centroids
345 else
346 write(message(1), '(a)') 'Debug: All centroid points converged'
347 call messages_info(1, debug_only=.true.)
348 ! Break loop
349 exit
350 endif
351
352 enddo
353
354 if (discretize_centroids) then
355 call mesh_discretize_values_to_mesh(mesh, centroids)
356 endif
357
358 if (present(inertia)) then
359 call compute_centroid_inertia(mesh, centroids, weight, ip_to_ic, inertia)
360 endif
361
362 safe_deallocate_a(prior_centroids)
363 safe_deallocate_a(ip_to_ic)
364 safe_deallocate_a(points_differ)
365
366 pop_sub(weighted_kmeans)
367
368 end subroutine weighted_kmeans
369
370
375 subroutine sample_initial_centroids(mesh, centroids, seed_value)
376 class(mesh_t), intent(in ) :: mesh
377 real(real64), contiguous, intent(out) :: centroids(:, :)
378 integer(int64), intent(inout), optional :: seed_value
379 ! This will get mutated by the Fisher Yates shuffle
380
381 integer(int32) :: n_centroids
382 integer(int64), allocatable :: centroid_idx(:)
383 integer(int64) :: ipg
384 integer(int32) :: ic
387
388 n_centroids = size(centroids, 2)
389 safe_allocate(centroid_idx(1:n_centroids))
390
391 ! Choose n_centroids indices from [1, np_global]
392 call fisher_yates_shuffle(n_centroids, mesh%np_global, seed_value, centroid_idx)
393
394 ! Convert ip_global to (x,y,z)
395 do ic = 1, n_centroids
396 ipg = centroid_idx(ic)
397 centroids(:, ic) = mesh_x_global(mesh, ipg)
398 enddo
399
400 safe_deallocate_a(centroid_idx)
401
403
404 end subroutine sample_initial_centroids
405
406
416 subroutine compute_centroid_inertia(mesh, centroids, weight, ip_to_ic, inertia)
417 class(mesh_t), intent(in) :: mesh
418 real(real64), intent(in) :: centroids(:, :)
419 real(real64), intent(in) :: weight(:)
420 integer, intent(in) :: ip_to_ic(:)
421 real(real64), intent(out) :: inertia
422 integer :: ip, ic
423
424 inertia = 0.0_real64
425
426 !$omp parallel do private(ip) reduction(+ : inertia)
427 do ip = 1, mesh%np
428 ic = ip_to_ic(ip)
429 inertia = inertia + weight(ip) * sum((centroids(:, ic) - mesh%x(ip, :))**2)
430 enddo
431 !$omp end parallel do
432
433 call mesh%allreduce(inertia)
434
435 end subroutine compute_centroid_inertia
436
438
439!! Local Variables:
440!! mode: f90
441!! coding: utf-8
442!! End:
type(debug_t), save, public debug
Definition: debug.F90:156
subroutine report_differences_in_grids(points, updated_points, tol, points_differ)
Report differences returned from compute_grid_difference.
subroutine, public weighted_kmeans(space, mesh, weight, centroids, n_iter, centroid_tol, discretize, inertia)
Weighted K-means clustering.
subroutine, public sample_initial_centroids(mesh, centroids, seed_value)
Sample initial centroids from the full mesh.
subroutine, public assign_points_to_centroids_finite_bc(mesh, centroids, ip_to_ic)
Assign each grid point to the closest centroid. A centroid and its set of nearest grid points defines...
subroutine, public update_centroids(mesh, weight, ip_to_ic, centroids)
Compute a new set of centroids.
subroutine compute_centroid_inertia(mesh, centroids, weight, ip_to_ic, inertia)
Compute the inertia of all centroids.
subroutine compute_grid_difference(points, updated_points, tol, points_differ)
Compute the difference in two grids as .
This module defines the meshes, which are used in Octopus.
Definition: mesh.F90:118
subroutine, public mesh_discretize_values_to_mesh(mesh, values)
Assign a set of values to their nearest discrete points on the mesh.
Definition: mesh.F90:412
real(real64) function, dimension(1:mesh%box%dim), public mesh_x_global(mesh, ipg)
Definition: mesh.F90:812
character(len=256), dimension(max_lines), public message
to be output by fatal, warning
Definition: messages.F90:160
subroutine, public messages_info(no_lines, iunit, debug_only, stress, all_nodes, namespace)
Definition: messages.F90:616
This module contains some common usage patterns of MPI routines.
Definition: mpi_lib.F90:115
subroutine, public fisher_yates_shuffle(m, n, seed, values)
Return m random numbers from a range of with no replacement.
Definition: quickrnd.F90:309
This module is intended to contain "only mathematical" functions and procedures.
Definition: sort.F90:117
Describes mesh distribution to nodes.
Definition: mesh.F90:186
int true(void)