%~Mouliné par MaN_auto v.0.37.1 (6604bb9b) 2025-11-27 15:12:47
\documentclass[CRMECA,Unicode,biblatex,published]{cedram}

\addbibresource{CRMECA_Andrieu_20250036.bib}

\usepackage{siunitx}
\usepackage{subcaption}
\usepackage{array}
\newcolumntype{K}{>{\centering\arraybackslash}p{.35cm}}

% Landau big-O symbol, kept as a macro so the notation can be changed globally.
\newcommand*{\bigO}{O}

% Bound decorations used in the geometric notation:
%   \lo{x}   lower bound (underline),
%   \hi{x}   upper bound (wide overbar; \widebar is defined further below),
%   \aabb{x} both combined, used to denote an axis-aligned bounding box.
\newcommand*{\lo}[1]{\ensuremath{\underline{#1}}}
\newcommand*{\hi}[1]{\ensuremath{\widebar{#1}}}
\newcommand*{\aabb}[1]{\lo{\hi{#1}}}

% Calligraphic letter denoting a data-distribution "frame", e.g. \myframe{I}.
\newcommand*{\myframe}[1]{\ensuremath{\mathcal{#1}}}

% Pastel fill colors, one per MPI rank (rank0..rank7), used in the figures
% and tables to identify processes.
\definecolor{rank0}{rgb}{0.984314,0.538824,0.501176}
\definecolor{rank1}{rgb}{0.589020,0.752157,0.890196}
\definecolor{rank2}{rgb}{0.727059,0.921569,0.683137}
\definecolor{rank3}{rgb}{0.856471,0.737255,0.894118}
\definecolor{rank4}{rgb}{0.996078,0.763922,0.443922}
\definecolor{rank5}{rgb}{1.000000,1.000000,0.680000}
\definecolor{rank6}{rgb}{0.898039,0.816471,0.647059}
\definecolor{rank7}{rgb}{0.992157,0.772549,0.885490}

% Darker variants of the rank colors ("drank" = dark rank), e.g. for text
% such as the colored global IDs in Table 1.
\definecolor{drank0}{rgb}{0.984314,0.271529,0.211294}
\definecolor{drank1}{rgb}{0.408314,0.669333,0.890196}
\definecolor{drank2}{rgb}{0.426039,0.737255,0.355765}
\definecolor{drank3}{rgb}{0.833882,0.643137,0.894118}
\definecolor{drank4}{rgb}{0.996078,0.624627,0.112627}
\definecolor{drank5}{rgb}{1.000000,1.000000,0.488000}
\definecolor{drank6}{rgb}{0.898039,0.767529,0.496471}
\definecolor{drank7}{rgb}{0.992157,0.640784,0.821490}


\tikzset{ img/.style={inner sep=0}, label/.style={inner sep=0, font={\small}} }

%%%---------------------------------------------------------------------------

%%% DELIMITERS

\usepackage{mathtools}
% Paired delimiters (mathtools): \parens{x} -> (x), \braces{x} -> {x},
% \card{x} -> |x| (set cardinality). Starred forms auto-size; an optional
% argument selects a size, e.g. \parens[\big]{...}.
\DeclarePairedDelimiter{\parens}{\lparen}{\rparen}
\DeclarePairedDelimiter{\braces}{\{}{\}}
\DeclarePairedDelimiter{\card}{\lvert}{\rvert}

%%%---------------------------------------------------------------------------

%%% FIGURES

\makeatletter
% Typeset subfigure labels as "(a)", both in cross-references (\thesubfigure)
% and in the captions themselves (custom caption label format below).
% NOTE(review): no @-commands are used here, so the \makeatletter/\makeatother
% pair is not strictly necessary; kept as-is.
\renewcommand\thesubfigure{(\alph{subfigure})}
\DeclareCaptionLabelFormat{mysublabelfmt}{(\alph{subfigure})}
\makeatother
\captionsetup[sub]{labelfont=bf,labelsep=space,labelformat=mysublabelfmt}

%%%---------------------------------------------------------------------------

%%%% WIDEBAR

% https://tex.stackexchange.com/questions/16337/can-i-get-a-widebar-without-using-the-mathabx-package
\makeatletter
% \widebar{x}: an arbitrarily wide overbar accent whose horizontal placement
% accounts for the italic correction and skew of its argument, unlike plain
% \overline (see the tex.stackexchange link above). Used here via \hi/\aabb.
% It temporarily hijacks \mathaccent inside amsmath's accent machinery, so
% the original primitive is saved first.
\let\save@mathaccent\mathaccent
% Heuristic test: typeset #1 with an accent, with and without a leading
% \kern0pt (which suppresses single-symbol accent treatment); comparing the
% resulting heights decides whether to expand the "single symbol" branch (#2)
% or the "multiple symbols" branch (#3).
\newcommand*\if@single[3]{%
  \setbox0\hbox{${\mathaccent"0362{#1}}^H$}%
  \setbox2\hbox{${\mathaccent"0362{\kern0pt#1}}^H$}%
  \ifdim\ht0=\ht2 #3\else #2\fi
  }
%The bar will be moved to the right by a half of \macc@kerna, which is computed by amsmath:
\newcommand*\rel@kern[1]{\kern#1\dimexpr\macc@kerna}
%If there's a superscript following the bar, then no negative kern may follow the bar;
%an additional {} makes sure that the superscript is high enough in this case:
\newcommand*\widebar[1]{\@ifnextchar^{{\wide@bar{#1}{0}}}{\wide@bar{#1}{1}}}
%Use a separate algorithm for single symbols:
\newcommand*\wide@bar[2]{\if@single{#1}{\wide@bar@{#1}{#2}{1}}{\wide@bar@{#1}{#2}{2}}}
% #1 = accented material, #2 = 1 unless a superscript follows, #3 = 1 for a
% single symbol / 2 for multiple symbols.
\newcommand*\wide@bar@[3]{%
  \begingroup
  \def\mathaccent##1##2{%
%Enable nesting of accents:
    \let\mathaccent\save@mathaccent
%If there's more than a single symbol, use the first character instead (see below):
    \if#32 \let\macc@nucleus\first@char \fi
%Determine the italic correction:
    \setbox\z@\hbox{$\macc@style{\macc@nucleus}_{}$}%
    \setbox\tw@\hbox{$\macc@style{\macc@nucleus}{}_{}$}%
    \dimen@\wd\tw@
    \advance\dimen@-\wd\z@
%Now \dimen@ is the italic correction of the symbol.
    \divide\dimen@ 3
    \@tempdima\wd\tw@
    \advance\@tempdima-\scriptspace
%Now \@tempdima is the width of the symbol.
    \divide\@tempdima 10
    \advance\dimen@-\@tempdima
%Now \dimen@ = (italic correction / 3) - (width / 10)
    \ifdim\dimen@>\z@ \dimen@0pt\fi
%The bar will be shortened in the case \dimen@<0 !
    \rel@kern{0.6}\kern-\dimen@
    \if#31
      \overline{\rel@kern{-0.6}\kern\dimen@\macc@nucleus\rel@kern{0.4}\kern\dimen@}%
      \advance\dimen@0.4\dimexpr\macc@kerna
%Place the combined final kern (-\dimen@) if it is >0 or if a superscript follows:
      \let\final@kern#2%
      \ifdim\dimen@<\z@ \let\final@kern1\fi
      \if\final@kern1 \kern-\dimen@\fi
    \else
      \overline{\rel@kern{-0.6}\kern\dimen@#1}%
    \fi
  }%
  \macc@depth\@ne
  \let\math@bgroup\@empty \let\math@egroup\macc@set@skewchar
  \mathsurround\z@ \frozen@everymath{\mathgroup\macc@group\relax}%
  \macc@set@skewchar\relax
  \let\mathaccentV\macc@nested@a
%The following initialises \macc@kerna and calls \mathaccent:
  \if#31
    \macc@nested@a\relax111{#1}%
  \else
%If the argument consists of more than one symbol, and if the first token is
%a letter, use that letter for the computations:
    \def\gobble@till@marker##1\endmarker{}%
    \futurelet\first@char\gobble@till@marker#1\endmarker
    \ifcat\noexpand\first@char A\else
      \def\first@char{}%
    \fi
    \macc@nested@a\relax111{\first@char}%
  \fi
  \endgroup
}
\makeatother

%%%---------------------------------------------------------------------------

% Figure files are looked up in the ./figures/ subdirectory.
\graphicspath{{./figures/}}

% Shorthands for small math-mode kerns used to fine-tune symbol spacing:
% \mk/\Mk remove 1mu/2mu, \mK/\MK add 1mu/2mu.
\newcommand*{\mk}{\mkern -1mu}
\newcommand*{\Mk}{\mkern -2mu}
\newcommand*{\mK}{\mkern 1mu}
\newcommand*{\MK}{\mkern 2mu}

% Hyperlink colors (hyperref is presumably loaded by the cedram class --
% no explicit \usepackage{hyperref} appears in this file).
\hypersetup{urlcolor=purple, linkcolor=blue, citecolor=red}

\title{Dynamic load-balanced point location algorithm for data mapping}
\alttitle{Algorithme de localisation de points avec équilibrage de charge dynamique pour le transfert de données}

\author{\firstname{Bastien} \lastname{Andrieu}\CDRorcid{0009-0000-9937-0244}\IsCorresp}
\address{ONERA/DMPE, Université Paris-Saclay, 92320 Châtillon, France}
\email{bastien.andrieu@onera.fr}

\author{\firstname{Bruno} \lastname{Maugars}}
\address{ONERA/DAAA, Université Paris-Saclay, 92320 Châtillon, France}
\email{bruno.maugars@onera.fr}

\author{\firstname{Eric} \lastname{Quémerais}\CDRorcid{0009-0007-2018-6358}}
\address[1]{ONERA/DMPE, Université Paris-Saclay, 92320 Châtillon, France}
\email{eric.quemerais@onera.fr}

\keywords{\kwd{High Performance Computing (HPC)} \kwd{Message Passing Interface (MPI)} \kwd{dynamic load balancing}
\kwd{computational geometry}}
\altkeywords{\kwd{Calcul haute performance (HPC)} \kwd{interface de passage de messages (MPI)} \kwd{équilibrage dynamique de charge} \kwd{géométrie algorithmique}}

\editornote{Article submitted by invitation}
\alteditornote{Article soumis sur invitation}

\begin{abstract}
Data mapping between geometric domains with non-matching discretizations is an essential step in multi-component numerical simulation workflows. This paper presents a novel point location algorithm, designed for transferring data from unstructured meshes to point clouds, in a massively parallel distributed environment. Special emphasis is placed on load balancing, which is paramount for making the most of computing resources and achieving optimal performance. In general, the geometric entities of interest are unevenly distributed in the input frame provided by the calling codes. The algorithm therefore aims to rapidly prune the search space using a series of parallel preconditioning techniques, while redistributing data equitably across all processes at each step. Exact point-in-cell location is then computed in an embarrassingly parallel, well-balanced frame. All data movements performed throughout the point location algorithm are transparent to the calling codes, as the resulting geometric and parallel mappings are returned in the same frame as the input data. These mappings enable data transfer via spatial interpolation and optimized process-to-process communications. A weak scaling study is carried out in three scenarios representative of the variety of real-life applications. Comparison with a state-of-the-art algorithm shows that the new algorithm performs better overall, with speed-ups of up to a factor of 10 on 4,800 CPU cores.
\end{abstract}

\begin{altabstract}
Le transfert de données entre des domaines géométriques avec des discrétisations non conformes est une étape essentielle dans les workflows de simulation numérique multicomposants. Cet article présente un nouvel algorithme de localisation de points, conçu pour transférer des données depuis des maillages non-structurés vers des nuages de points, dans un environnement distribué massivement parallèle. Une attention particulière est accordée à l'équilibrage de charge, qui est primordial pour tirer le meilleur parti des ressources informatiques et obtenir des performances optimales. En général, les entités géométriques d'intérêt sont réparties de manière inégale dans la distribution d'entrée fournie par les codes appelants. L'algorithme vise donc à réduire rapidement l'espace de recherche à l'aide d'une série de techniques de préconditionnement parallèles, tout en redistribuant les données de manière équitable entre tous les processus à chaque étape. La localisation exacte des points dans les cellules est ensuite calculée en parallèle dans une distribution bien équilibrée. Les mouvements de données effectués tout au long de l'algorithme de localisation des points sont transparents pour les codes appelants, car les mappings géométriques et parallèles résultants sont renvoyés dans la même distribution que les données d'entrée. Ces mappings permettent le transfert de données par interpolation spatiale et des communications optimisées entre les processus. Une étude de scalabilité faible est réalisée dans trois scénarios représentatifs de la diversité des applications réelles. La comparaison avec un algorithme de l'état de l'art montre que le nouvel algorithme est globalement plus performant, avec des accélérations pouvant atteindre un facteur 10 sur 4800 cœurs CPU.
\end{altabstract}

\dateposted{2026-02-03}
\begin{document}
%\input{CR-pagedemetas}
%\end{document}
\maketitle

\section{Introduction}

In many scientific and engineering applications, numerical simulations require transferring data between arbitrarily discretized domains. Such applications include code coupling for multi-physics simulations~\cite{fabbri2023design, coria2022modeling}, solution transfer following adaptive remeshing~\cite{alauzet2016parallel}, and Lagrangian particle tracking~\cite{palha2015hybrid}. Depending on the different numerical methods, these spatial discretizations usually consist of meshes or point clouds.

Data transfer from a \emph{source} (donor) domain to a \emph{target} (receiver) domain breaks down into two main steps. First, a mapping between the source and target degrees-of-freedom (DoFs) must be computed. When treating the target DoFs as points, the task consists in identifying which cell of the source mesh contains each of these points. Such points typically correspond to mesh nodes or quadrature points in the Finite Element and Discontinuous Galerkin methods, or cell centers in the Finite Volume method. Second, the source-to-target mapping is used to interpolate and transfer data. The location step is the most computationally intensive, but only needs to be performed once at initialization if both source and target geometries remain static during the simulation. However, data transfer between domains with dynamic discretizations requires repeated location computations, making the performance of this operation crucial. Ideally, the time needed for data mapping should be within the same order of magnitude as the time required for a single iteration of a computational code. Naively testing all possible pairs of cells and points results in quadratic complexity, which is prohibitively expensive in practical applications. It is therefore essential to devise efficient preconditioning techniques that eliminate all the irrelevant pairs.

The growing demand for high-fidelity simulations requires an ever-increasing number of degrees-of-freedom, and simulations exceeding a billion DoFs are becoming more prevalent~\cite{roadmap2020}. Such simulations require so much memory and computing power that they can only be achieved on massively parallel distributed-memory systems, using parallel programming models such as the Message Passing Interface (MPI) \cite{the1993mpi}. In this context, the domains between which data is to be transferred are decomposed into partitions distributed across multiple processes.

This parallel context poses an additional challenge. The domain decompositions are tailored for the specific needs of each computational code and are therefore not optimal for the location problem. Since the source and target partitions are unlikely to align, extra communication is necessary to compute point location and exchange interpolated data. To prevent communication latency from becoming a bottleneck, the preconditioning stage must also minimize unnecessary communications. Besides, the geometric entities of interest may be poorly distributed in the decompositions provided as input to the location algorithm. For instance, when transferring data through the common surface between two adjacent volume domains, only a fraction of processes hold a portion of this surface in their partition. If no action is taken to redistribute the workload, this imbalance can severely degrade performance.

The distribution and amount of input data can vary significantly across different data mapping applications. Ensuring good performance regardless of this variability is challenging and developing a single algorithm that runs efficiently in all situations remains an open problem.

The rest of this paper is structured as follows. Section~\ref{sec:related_work} gives a brief overview of existing work on data transfer for massively parallel simulations. A novel parallel point location algorithm tailored for data transfer between distributed meshes and point clouds is then described in Section~\ref{sec:algo}. Finally, a performance study of this algorithm in several test cases representative of the variety of real applications is reported in Section~\ref{sec:perfo}, along with comparisons with another high-performance algorithm.

\section{Related work}\label{sec:related_work}

The \emph{precise Code Interaction Coupling Environment} (preCICE) library~\cite{chourdakis2022precice} is a state-of-the-art open-source coupling library which features multiple data transfer methods. The underlying location algorithm, recently improved by Totounferoush et~al.~\cite{totounferoush2021efficient}, can be broken down into two major steps. The first step consists in comparing the axis-aligned bounding boxes (AABBs) of the mesh partitions owned by each process, in order to establish the pairs of processes from the two coupled codes that will need to communicate. The AABBs are initially collected by a master process for each coupled code. They are then transmitted to the other master process and subsequently broadcast to all the remaining processes involved in the computation. In the second step, source mesh partitions are transferred to the corresponding target processes using a process-to-process communication pattern. Each process then locates its own target points within the received source mesh partitions. This is performed efficiently using an R-Tree data structure. While this algorithm shows great improvement over its predecessor, some limitations still remain. Most of the total execution time is spent exchanging and comparing the AABBs in the first step and communicating source mesh partitions to the target processes. The load imbalance issue is also not addressed. In fact, the performance studies shown assume meshes uniformly distributed on all processes, which is unlikely in real-life surface coupling applications.

The \emph{Finite Volume Mesh} (FVM) library~\cite{fournier2011optimizing} features location and exchange capabilities, leaving the interpolation step to the calling codes for better genericity. The location algorithm follows essentially the same two steps as preCICE, albeit with noticeable strategic differences. First, the partition AABBs are exchanged via collective communications, thus avoiding the bottleneck inherent to the master-slave approach implemented in preCICE. Second, the location computation is performed on the source side, meaning each process sends its target points to its corresponding source processes. This approach reduces the amount of communication, since smaller messages are needed to exchange points instead of mesh partitions. In order to keep the memory requirements low, blocking send/receive communications are used and the received point clouds are located one at a time. The location step is then accelerated by storing the received points in a local octree. Fournier~\cite{fournier2020massively} notes that this strategy can lead to excessive serialization in worst-case scenarios. To solve this problem, a technique for ordering communicating processes by recursive subdivision is implemented in the \emph{Parallel Location and Exchange} (PLE) library, which is derived from FVM. Fournier also points out that the first coarse-grained filter based on a single AABB per process can lead to the communication of a large number of potentially irrelevant points, due to numerous false positive detections. Assuming a uniformly distributed point cloud, the number of points received by each process is correlated to the volume of its AABB. Yet this volume highly depends on the shape of the partitions and can vary considerably from one process to another, resulting in significant load imbalance. Possible optimizations are proposed to address this issue, including the use of a distributed box-tree data structure, enabling finer filtering. 
Once calculated by the calling code, the interpolated data is finally exchanged following the same communication pattern as in the first stage of the location algorithm. All pairs of processes whose partition AABBs intersect thus communicate, even if no points from one process have effectively been located in the mesh partition held by the other. Load imbalance and serialization can therefore compromise this step as well.

The \emph{Data Transfer Kit} (DTK) library~\cite{slattery2013data} addresses the load imbalance issue by creating a secondary \emph{rendezvous} decomposition~\cite{plimpton2004parallel}, well balanced for the location problem. Points and cells outside the domain bounded by the intersection of the global source and target AABBs are first discarded. Recursive coordinate bisection is then performed on the combined source and target geometries. The MPI communicator is also recursively split along the way, leaving in the end one partition per process, each containing geometrically close source cells and target points. The splitting procedure aims to balance the combined number of source cells and target points. Some rendezvous partitions can thus end up containing virtually only source cells and no target points, or vice versa. This worst-case scenario can occur if the level of refinement of both source and target discretizations differ significantly. Geometric location in the rendezvous decomposition is accelerated using geometric binning or a local $k$d-tree. The authors also propose to perform the interpolation step in a well-balanced fashion in the same rendezvous decomposition, at the expense of additional communications required to transfer the source field data from the initial decomposition to the rendezvous decomposition.

The algorithm presented in the next section aims to remedy the shortcomings mentioned above, by developing a more refined preconditioning strategy and ensuring good load balance at each critical step.

\section{Point location algorithm}\label{sec:algo}

\subsection{Key concepts}

Before the algorithm itself is presented, the following paragraphs define the terms used in this paper and outline the key concepts at the core of our point location algorithm.

\subsubsection{Point location problem}

We focus on data transfer between unstructured meshes and point clouds, with interpolation schemes using stencils restricted to the source cell containing each target point. Other types of interpolation with wider stencils (e.g.\ $k$ nearest neighbors, radial basis functions, etc.) would require a different preconditioning strategy, which will be studied in future work.

Given a set $S$ of \emph{source} cells and a set $T$ of \emph{target} points, point location consists in finding for each point in $T$ the \emph{host} cell from $S$ in which it lies, if any.

The two sets $S$ and $T$ are initially distributed on $P$ processes that form an arbitrary MPI communicator. All subsequent communications will occur within this communicator, which is provided as an input.

\subsubsection{Frames}

Dynamic load balancing is essential to achieve high performance on distributed-memory systems, and involves redistributing data across processes throughout the algorithm. In this paper, the different data distributions are called \emph{frames}. If $E$ designates a set of entities, the part of $E$ held by process $p$ in frame $\myframe{F}$ will be denoted by $E^{\myframe{F}}_p$. The size of this part is denoted by $\card{E^{\myframe{F}}_p}$. The \emph{input} frame will be denoted by $\myframe{I}$. In order to remain as generic as possible, no particular assumption is made on this frame. Some parts may contain more entities than others, some may even be empty. Some entities may also be held by multiple processes, as in the case of ghost cells or mesh vertices located on boundaries between adjacent partitions.

\subsubsection{Global identifiers}

To keep track of an entity (a source cell, its bounding box or a target point) moving across different frames, our algorithm relies on
\emph{global} identifiers (IDs). Contrary to the \emph{local} IDs used within each partition, global IDs are \emph{frame-independent} and \emph{unique}, meaning that if two entities on different processes share the same global ID they are in fact two instances of the same entity. This is essential for achieving reproducible results that do not depend on how data is distributed in the input frame. Such global numbering can either be provided as an input, or generated from any ordered data set.

\subsubsection{Dynamic load balancing framework}\label{sec:dynamic_load_balancing}

Dynamic load balancing is traditionally achieved using graph-based partitioning methods since they generally yield simply connected partitions with minimal edge cut. While these properties are essential for most computational codes, they are of little interest for solving the point location problem in parallel. When partition connectedness is not an issue, parallel sorting algorithms provide a more cost-effective solution for redistributing the workload. Global IDs are also used extensively for this purpose in our algorithm. Entities are re-partitioned by assigning each process an equal-sized block of entities with contiguous global IDs. This is achieved by sorting entities globally in ascending order of IDs using parallel bucket sort. If entities have associated weights, a parallel bucket sampling algorithm\footnote{An implementation on the GPU of this algorithm is presented in~\cite{cazalbou24}.} is used in order to devise blocks of equal weight prior to sorting.

\subsubsection{Subsets}

Some input data might not be relevant to the location problem, e.g.\ cells that are guaranteed to contain no points. Our algorithm aims to isolate the \emph{subsets} of interest by quickly pruning these irrelevant entities. Building new global numberings restricted to such subsets provides better conditioning for the sorting algorithms used for dynamic load balancing. Communication graphs are associated with each subset in order to enable data movement between the different frames. To make this possible, an explicit link between the subset and parent global numberings is maintained throughout the different steps of our algorithm. Table~\ref{tab:frames_subset} illustrates this concept.

\begin{table}[h!]
\caption{Illustration of a subset (second row) and its parent set (first row) distributed on three processes (each process is represented by a color). The subset is described in two different frames: $\myframe{F}_1$ which is balanced with respect to the parent set, and $\myframe{F}_2$ which is balanced with respect to the subset. Frame $\myframe{F}_2$ is obtained by redistributing the subset entities in ascending order of parent global IDs. This order is preserved in the global numbering proper to the subset.}\label{tab:frames_subset}
\resizebox{\textwidth}{!}{
\begin{tabular}{| l | K K K K | K K K K | K K K K | K K | K K | K K |}
\hline
& \multicolumn{12}{c|}{Frame $\myframe{F}_1$} & \multicolumn{6}{c|}{Frame $\myframe{F}_2$} \\
Global ID in parent set & \textcolor{drank0}{3} & \textcolor{drank0}{7} & \textcolor{drank0}{1} & \textcolor{drank0}{9} & \textcolor{drank1}{2} & \textcolor{drank1}{6} & \textcolor{drank1}{10} & \textcolor{drank1}{12} & \textcolor{drank2}{5} & \textcolor{drank2}{8} & \textcolor{drank2}{4} & \textcolor{drank2}{11} & \textcolor{drank0}{4} & \textcolor{drank0}{6} & \textcolor{drank1}{7} & \textcolor{drank1}{8} & \textcolor{drank2}{10} & \textcolor{drank2}{11} \\
Global ID in subset &  -- & \textcolor{drank0}{3} &  -- &  -- &  -- & \textcolor{drank1}{2} & \textcolor{drank1}{5} &  -- &  -- & \textcolor{drank2}{4} & \textcolor{drank2}{1} & \textcolor{drank2}{6} & \textcolor{drank0}{1} & \textcolor{drank0}{2} & \textcolor{drank1}{3} & \textcolor{drank1}{4} & \textcolor{drank2}{5} & \textcolor{drank2}{6} \\
\hline
\end{tabular}}
\end{table}

\subsection{Algorithm outline}\label{sec:algorithm}

The point location algorithm presented in this paper is structured in the following five main steps. A first coarse-grained filter is followed by a second, finer preconditioning step based on fast point-in-box tests. This search for candidate cell-point pairs is accelerated using a distributed tree structure, which makes it possible to redistribute the location workload evenly. The third step consists in computing exact point-in-cell location for all candidate pairs. Potential conflicts are then resolved in a fourth step, in order to retain at most one host cell per point. Last, the source-to-target process-to-process communication channels are established for subsequent exchanges of interpolated data.

These five steps are detailed in the next sections and illustrated with a basic, two-dimensional example.

\subsection{Coarse filtering}\label{sec:coarse_filtering}

The first step in our point location algorithm consists in a coarse filtering similar to the one proposed by Plimpton et~al.~\cite{plimpton2004parallel}. The aim is to quickly eliminate cells and points that are clearly irrelevant to the location problem. Each process computes the AABB of the source cells in its partition. A collective reduction operation is then performed to obtain the global AABB of $S$, denoted by $\aabb{S}$ (shown in solid gray line in Figure~\ref{fig:input_frame_filter}). Let $T'$ denote the subset of target points located inside $\aabb{S}$. These are the only points that can possibly be located inside a source cell. The global AABB $\aabb{T'}$ of $T'$ is then computed in a similar way (shown in dashed black line in Figure~\ref{fig:input_frame_filter}). Let $S'$ denote the subset of cells whose AABB intersects $\aabb{T'}$. These are the only cells that can possibly contain target points. Cells (resp.\ points) not in $S'$ (resp.\ $T'$) will no longer be considered in the remainder of the algorithm. This step proves useful when the source and target geometries overlap only partially (such as in the example of Figure~\ref{fig:input_frame}), yet induces very little overhead if they overlap completely, as will be highlighted in Section~\ref{sec:perfo}.

At this stage, each process holds a (possibly unequal) part of each subset $S'$ and $T'$, respectively denoted by $S'^{\myframe{I}}_p$ and $T'^{\myframe{I}}_p$. In the example shown in Figure~\ref{fig:input_frame}, $S'$ is distributed over almost only two processes.

\begin{figure}
\begin{subfigure}[t][][t]{0.48\textwidth}
\centering
\includegraphics[width=\textwidth]{03_src_tgt_user.pdf}
\caption{Whole partitions.}\label{fig:input_frame_all}
\end{subfigure}
\hfill
\begin{subfigure}[t][][t]{0.48\textwidth}
\centering
\includegraphics[width=\textwidth]{04_src_tgt_prefilter.pdf}
\caption{Subsets $S'$ and $T'$ in the same frame. The solid gray and dashed black rectangles depict the global AABBs of $S$ and $T'$, respectively.}\label{fig:input_frame_filter}
\end{subfigure}
\caption{Source mesh and target point cloud partitions in the input frame $\myframe{I}$ (each color represents a process).}\label{fig:input_frame}
\end{figure}

\subsection{Search for candidate pairs}\label{sec:search_candidates}

The search for candidate cell-point pairs relies on inexpensive point-in-box tests. However, the naive strategy that consists in checking all possible pairs local to each process becomes prohibitively expensive for large numbers of cells and points. Besides, at this stage, subsets $S'$ and $T'$ are arbitrarily distributed in the input frame, and their respective parts generally do not align.

Therefore, each process must identify to which other processes each of its points or boxes should be sent. This filtering is also based on AABB comparisons. To address the issues raised in Section~\ref{sec:related_work}, our algorithm relies on multiple boxes per process. This more refined filter helps reduce the amount of data exchanged in this step.

Sundar et~al.~\cite{sundar2008bottom} present a method for constructing distributed linear octrees that naturally yields such boxes. This octree structure can also be leveraged to speed up the second stage of the candidate search, local to each process.

In principle, the distributed box-tree proposed by Fournier~\cite{fournier2020massively} is quite similar to this structure. Both are constructed following a bottom-up approach, ensuring a good load balance using re-partitioning based on the Morton space-filling curve~\cite{morton1966computer}. However, whereas the box-tree requires fine-tuning of four parameters with complex combined effects, the octree proposed by Sundar is governed by just two parameters: the maximum depth of the tree and the maximum number of points contained in each of its leaf nodes.

\subsubsection{Octree construction}

The algorithm for constructing the distributed octree~\cite{sundar2008bottom} consists in the following main steps. Target points are first sorted globally in ascending Morton order and redistributed evenly to obtain a new frame $\myframe{O}$. In this frame, all partitions $\{T_p^{\prime\myframe{O}}\}$ contain an equal number of points. This first step is further detailed in Section~\ref{sec:sfc_repartitioning}. The points are then sorted locally and converted into octree leaves at maximum depth. Then, a minimal linear octree is constructed in parallel by filling in the gaps between successive leaves with empty octants. Finally, the octree is coarsened so as to partition the point cloud into as few coarse \emph{blocks} as possible while maintaining good load balance. From each of these coarse blocks stems a finer sub-tree local only to the process owning that block. This algorithm ensures that the number of blocks per process is between~1 and~8. These sub-trees share a common ancestry, thus forming a complete, distributed octree. Such an octree (in fact a two-dimensional quadtree) is represented in Figure~\ref{fig:balanced_octree}.

\begin{figure}
\begin{subfigure}[t][][t]{0.48\textwidth}
\centering
\includegraphics[height=7cm]{05_octree.pdf}
\caption{Distributed quadtree and the associated target partitions in frame $\myframe{O}$. Coarse blocks (solid black rectangles) and their effective AABBs (dashed rectangles) are shown, along with the sub-trees local to each process.}\label{fig:balanced_octree}
\end{subfigure}
\hfill
\begin{subfigure}[t][][t]{0.48\textwidth}
\centering
\includegraphics[height=7cm]{06_src_morton.pdf}
\caption{AABBs of the source cells in the Morton SFC-based balanced frame $\myframe{B}$.}\label{fig:src_aabb_morton}
\end{subfigure}
\caption{SFC-based frames for target and source entities.}\label{fig:octree_frame}
\end{figure}

\subsubsection{Octree query}

Once octree construction is complete, our algorithm proceeds as follows. The coarse blocks are gathered on all processes so that each process can determine to which other processes it should send the source AABBs it owns. A given source AABB is sent to a process if it intersects at least one of the coarse blocks owned by this process. In order to further reduce the amount of false positive detections, each block is reduced to the AABB of the points it contains. Such AABBs are shown as dashed rectangles in Figure~\ref{fig:balanced_octree}.

As the number of processes increases, the memory and CPU cost of storing and querying these blocks can become problematic. CPU overhead can be amortized by storing the blocks in a tree structure, thus enabling queries in $\bigO\parens[\big]{\card{S'_p} \log(P)}$ time. This operation is trivial, since the blocks are natively obtained from a coarse octree. The shared coarse tree associated with the example in Figure~\ref{fig:balanced_octree} is represented in Figure~\ref{fig:coarse_octree}. Besides, a single instance of this coarse tree can be stored on each compute node, taking advantage of shared memory. On typical supercomputers with tens of cores per compute node, the memory footprint of the coarse tree can thus be reduced by at least an order of magnitude.

\begin{figure}
    \begin{tikzpicture}
      [level distance=12mm,
       every node/.style={draw=black, thick, fill=black,circle,inner sep=1pt,minimum width=6pt,minimum height=6pt},
       edge from parent/.style={draw,thick},
       level 1/.style={sibling distance=18mm,nodes={}},
       level 2/.style={sibling distance=4mm,nodes={}}]
      \node {}
         child { node {}
             child { node [fill=rank0] {}}
             child { node [fill=rank0] {}}
             child { node [fill=rank1] {}}
             child { node [fill=rank1] {}}
             }
         child { node {}
             child { node [fill=rank2] {}}
             child { node [fill=rank2] {}}
             child { node [fill=rank3] {}}
             child { node [fill=rank3] {}}
             }
         child { node {}
             child { node [fill=rank4] {}}
             child { node [fill=rank4] {}}
             child { node [fill=rank5] {}}
             child { node [fill=rank5] {}}
             }
         child { node {}
             child { node [fill=rank6] {}}
             child { node [fill=rank6] {}}
             child { node [fill=rank7] {}}
             child { node [fill=rank7] {}}
             };
    \end{tikzpicture}
    \caption{Schematic view of the shared coarse tree. Each leaf corresponds to a coarse block, colored by its owning process.}
    \label{fig:coarse_octree}
\end{figure}

Querying the coarse tree thus provides a connection between each source AABB and its processes of interest. This connection is used to exchange the boxes through collective MPI communications. As a result, each process receives a partition $S_p^{\prime\myframe{O}}$ of source AABBs in the octree frame $\myframe{O}$. The part of the octree local to this process is then inspected in order to find candidate points for each of these boxes.

Despite the acceleration provided by the tree structure, querying the coarse blocks can become a critical step if the partitions $S'^{\myframe{I}}_p$ are unbalanced. This is typically the case in the example shown in Figure~\ref{fig:input_frame_filter}. Moreover, an arbitrary distribution of boxes can result in a highly dense communication graph, significantly increasing the latency of collective communications.

To address these issues, the source AABBs are redistributed before querying the shared coarse tree. A good heuristic is to apply the same sorting and redistribution as performed on the points in the first step of octree construction. The new, balanced frame thus obtained is denoted by $\myframe{B}$. As shown in Figure~\ref{fig:src_aabb_morton}, the source partitions $\{S'^{\myframe{B}}_p\}$ are mostly aligned with the target partitions $\{T'^{\myframe{O}}_p\}$. The next paragraph briefly describes how this re-partitioning is achieved.

\vspace{.2\baselineskip}

\subsubsection{SFC-based re-partitioning}\label{sec:sfc_repartitioning}

Given an arbitrarily distributed set of geometric entities (i.e.\ points or boxes), the goal is to find a new, well-balanced distribution that maximizes data locality. Space-filling curves (SFCs) have been used extensively for this purpose~\cite{aluru1997parallel, borrell2018parallel}.

SFCs continuously map points in three-dimensional space to one-dimensional space. These curves have inherent multi-resolution properties due to their iterative construction, making them attractive for constructing linear tree structures. Setting a maximum resolution reduces space to a finite number of voxels. The Cartesian coordinates of a point can then be encoded as a nonnegative integer, corresponding to its voxel index, dictated by the traversal order of the SFC. The Morton SFC (or ``Z-order curve'')~\cite{morton1966computer} provides a straightforward encoding that can be very efficiently computed using bit shifts. The Morton ordering is equivalent to that obtained by traversing an octree depth-first.

Subsets $S'$ and $T'$ are re-partitioned independently, but in the same way. The strategy described in Section~\ref{sec:dynamic_load_balancing} is followed, except that Morton codes are used instead of arbitrary global IDs. These Morton codes are computed by encoding either the target point coordinates or the coordinates of the source AABB centers. The subset global IDs thus obtained correspond to the order in the globally sorted array of Morton codes.

The re-partitioning induces a new balanced frame for each subset $S'$ and $T'$, to which data must be communicated from the input frame. The coordinates of the target points are communicated, along with only the AABBs of the source cells. The search for candidate pairs indeed requires no additional information such as mesh connectivity or vertex coordinates. To obtain the final source-to-target mapping in the input frame, as discussed in Section~\ref{sec:return_to_input_frame}, it is essential to maintain an explicit link between subset entities and their corresponding parent sets. The global IDs of entities in the parent sets $S$ and $T$ are therefore communicated to the new frames as well.

\goodbreak

Since $S'$ and $T'$ are re-partitioned independently of each other, the exchange of point data can be overlapped with computations for finding the balanced distribution of source cells, using non-blocking collective communications. Moreover, the exchange of cell data can be overlapped with the construction of the distributed octree.

\subsection{Exact point-in-cell location}\label{sec:exact_location}

Once the octree query is complete, a connection is obtained between the source cells and their candidate points. This connection is represented in the form of a graph that is distributed in the octree frame $\myframe{O}$.

The next step consists in computing the exact location of these candidate points for each cell. Weights for subsequent interpolation of source data at target points must also be calculated. Applications based on the Finite Element method usually rely on shape function interpolation. To this end, the isoparametric coordinates of each candidate point are determined using Newton--Raphson iteration. When dealing with ill-conditioned shape functions (e.g.\ highly curved high-order elements), this iteration may fail to converge. In this case a second, more robust technique is used, which consists in recursively subdividing the cell after a first decomposition into simplices. When dealing with meshes composed of arbitrary polygonal or polyhedral cells, a first inclusion test determines whether each candidate point lies inside or outside the cells. In the first case, mean value coordinates~\cite{hormann2006mean, ju2005} are calculated to serve as interpolation weights. In the second case, the nearest projection onto the cell's faces is computed, along with the distance from this projection.

These exact location computations could be carried out in the frame $\myframe{O}$. However, this frame is not guaranteed to be well balanced. Some processes may indeed hold more cells than others, or cells with more candidate points. Load balance in this stage is critical since computing exact location is expensive. In addition, these computations require a full description of the source cells, i.e.\ mesh connectivity along with vertex coordinates, whereas only AABBs are available at this point in frame $\myframe{O}$. This additional information is only available in the input frame $\myframe{I}$.

The choice is thus made to create a new, better-balanced rendezvous frame in which exact location will be computed. In order to keep data movements to a minimum, the rendezvous frame is created such that each cell is owned by a single process. Points may in turn be duplicated on multiple processes. The number of candidate points in the AABB of each cell gives a simple yet effective estimation of the workload associated with that cell. This way, cells with no candidate points can simply be discarded. The different cell types could also be taken into consideration to further refine this estimate.

This rendezvous frame is obtained using the approach described in Section~\ref{sec:dynamic_load_balancing} by sorting cells in ascending order of the subset global IDs derived from the SFC-based re-partitioning. The data locality provided by this global numbering reduces duplication of mesh entities (e.g.\ faces, vertices) since adjacent cells are likely to be owned by the same process in the rendezvous frame, as indicated in Figure~\ref{fig:rendezvous_all}.

\begin{figure}
\begin{subfigure}[t][][t]{0.49\textwidth}
\centering
\includegraphics[height=7cm]{06_src_tgt_extract.pdf}
\caption{Partitions of candidate cells and points in the rendezvous frame $\myframe{R}$. Points duplicated on multiple processes are shown in multiple colors.}\label{fig:rendezvous_all}
\end{subfigure}
\hfill
\begin{subfigure}[t][][t]{0.49\textwidth}
\centering
\includegraphics[height=4.5cm]{location_conflict.pdf}
\caption{Location conflict for a point with multiple candidate cells (colored by owning process in frame $\myframe{R}$).}\label{fig:rendezvous_conflict}
\end{subfigure}
\caption{Exact point-in-cell location steps.}\label{fig:rendezvous}
\end{figure}

\subsection{Conflict resolution}

Each target point may be contained in the AABB of multiple source cells. Besides, these cells may reside on different processes in the rendezvous frame, and therefore so do the target points. In the end, each point must be mapped to at most one host cell. Possible conflicts are resolved by filtering the location data computed in the previous step in order for each point to keep only the best match among its associated candidate cells. This filtering is performed in two stages. First, the candidates local to each process are sorted in ascending order of signed distance.\footnote{A negative distance indicates that the point is contained inside the cell.} The nearest candidate in that process is then kept, while the others are discarded. The second stage is similar, except that this time the sorting is carried out on the best candidates identified by all the processes in the previous stage. If a point is located exactly between two cells, the conflict is resolved by selecting the cell with the lower global ID. This way, the location algorithm is guaranteed to yield reproducible results that do not depend on the number of processes. In order to balance the workload, this second stage is performed in a new frame $\myframe{C}$.

The approach described in Section~\ref{sec:dynamic_load_balancing} is used once again to devise such a balanced frame. Distance values of the cells that pass the first filter are exchanged from the rendezvous frame $\myframe{R}$ to the conflict-resolution frame $\myframe{C}$. The result of the second filter is then sent back to frame $\myframe{R}$ so that each cell can discard candidate points for which it is not the best match.

In the situation depicted in Figure~\ref{fig:rendezvous_conflict}, a target point $t$ is associated with four candidate cells, distributed over three processes in frame $\myframe{R}$ (indicated by colors). On the blue process, the first filter retains the nearest cell, namely $s_3$. Distances from $s_1$, $s_2$ and $s_3$ are then communicated to the process holding $t$ in frame $\myframe{C}$. The second filter finally retains $s_1$.

\subsection{Return to the input frame}\label{sec:return_to_input_frame}

At this stage, all false positive detections have been eliminated. The remaining cell--point pairs form the actual mapping used for transferring data between the source and target domains. However, this data is usually distributed in the same frame as the input source and target partitions, namely frame $\myframe{I}$. This data could first be communicated from frame $\myframe{I}$ to the rendezvous frame $\myframe{R}$, in which interpolation would be performed, as suggested by Plimpton et~al.~\cite{plimpton2004parallel}. Nevertheless, frame $\myframe{R}$ may no longer be well balanced since the connection between cells and points has just been pruned. Besides, additional application-specific information may be required for spatial interpolation, and would need to be communicated to frame $\myframe{R}$ as well. Experience shows that the interpolation step is about two orders of magnitude less expensive than the location step. Possible load imbalance due to poor distribution in the input frame therefore does not compromise performance. Sticking to frame $\myframe{I}$ for interpolation is thus a sensible choice. The source-to-target mapping is consequently transferred from the rendezvous frame to the input frame.

A process-to-process communication pattern is finally established between the source and target partitions in this input frame, to enable subsequent exchanges of interpolated data.

\section{Performance results}\label{sec:perfo}

The point location algorithm presented in this paper has been implemented in the open-source CWIPI coupling library~\cite{cwipi, cwipi_github}, along with high-level wrappings to MPI primitives to carry out the transfer of interpolated data using non-blocking send/receive communications, based on the previously described communication pattern. CWIPI formerly relied on the FVM library for data mapping, using the algorithm mentioned in Section~\ref{sec:related_work}. The tests presented in this section were carried out in version 1.1.0 of CWIPI~\cite{cwipi_github} which incorporates both algorithms, thereby enabling a direct comparison.

\subsection{Test cases description}

The following setup is considered to study and compare the performance of the two point location algorithms. Starting from a Cartesian grid, an unstructured source mesh is obtained by decomposing the hexahedral cells into tetrahedra. To break the alignment with the Cartesian axes, the geometry is deformed and a slight random perturbation is applied to the vertex coordinates, as shown in Figure~\ref{fig:test_meshes}.

\begin{figure}[h]
\begin{subfigure}[t][][t]{0.48\textwidth}
\centering
\includegraphics[height=4.2cm]{volume_mesh.png}
\caption{Volume source mesh used in the full and partial volume overlap scenarios.}\label{fig:test_meshes_vol}
\end{subfigure}
\hfill
\begin{subfigure}[t][][t]{0.48\textwidth}
\centering
\includegraphics[height=4.2cm]{surface_mesh.png}
\caption{Surface mesh extraction used in the surface overlap scenario.}\label{fig:test_meshes_surf}
\end{subfigure}
\caption{Meshes used in the tests (colors indicate the different partitions).}\label{fig:test_meshes}
\end{figure}

A second mesh is generated using the same procedure, and its cell centers are employed as the target point cloud. Grids with slightly different numbers of points are used so that the two meshes do not match exactly. The source and target meshes are partitioned using the PT-Scotch library~\cite{chevalier2008pt} on two separate MPI communicators, each with the same number of processes. The communicators are then merged by CWIPI into a single COMM\_WORLD communicator. Since the partitioning method used is not deterministic, the sensitivity of both algorithms to input distributions is examined by executing each test case multiple times. A weak scaling study is carried out, with roughly \num{2.5e5} cells (resp.\ points) in each source (resp.\ target) partition. First, a source-to-target mapping is constructed using the point location algorithms. Then, a field evaluated on the source mesh is interpolated and transferred to the target points using either the communication scheme proposed by FVM or the one described at the end of Section~\ref{sec:return_to_input_frame}. Interpolation here consists simply in assigning each target point the field value of its host cell, as the study focuses primarily on the performance of the parallel algorithm (the use of a more refined scheme such as shape-function interpolation would have no impact on overall performance, since the interpolation weights are calculated regardless). The code for reproducing the test cases (as well as the input meshes, which are automatically generated at runtime) is available online (see Section~\ref{sec:supp_data} for the details).

Different scenarios representative of the wide range of real-life applications are realized by offsetting the meshes relative to each other in the $x$-direction, as represented in Figure~\ref{fig:test_scenarios}. For each of the three configurations considered, execution times for the location and interpolation steps are reported for both algorithms. Minimum, mean and maximum times over the multiple executions of each test are reported. Finally, a detailed breakdown of the location time is presented for the new algorithm.

\begin{figure}[h!]
\begin{subfigure}[t][][t]{0.33\textwidth}
\centering
\begin{tikzpicture}[]
\node[img] (i1) at (0,0) {\includegraphics[width=2.24cm]{visu_mesh_0.pdf}};
\node[img] (i2) at (0,0) {\includegraphics[width=2.24cm]{visu_mesh_1.pdf}};
\end{tikzpicture}
\caption{Full volume overlap.}\label{fig:test_full_volume}
\end{subfigure}
\hfill
\begin{subfigure}[t][][t]{0.33\textwidth}
\centering
\begin{tikzpicture}[]
\node[img] (i3) at (0,0) {\includegraphics[width=2.24cm]{visu_mesh_0.pdf}};
\node[img] (i4) at (1.75cm,0) {\includegraphics[width=2.24cm]{visu_mesh_1.pdf}};
\end{tikzpicture}
\caption{Partial volume overlap.}\label{fig:test_partial_volume}
\end{subfigure}
\begin{subfigure}[t][][t]{0.33\textwidth}
\centering
\begin{tikzpicture}[]
\node[img] (i5) at (0,0) {\includegraphics[width=2.24cm]{visu_mesh_0.pdf}};
\node[img] (i6) at (1.848cm,0) {\includegraphics[width=2.24cm]{visu_mesh_1.pdf}};
\end{tikzpicture}
\caption{Surface overlap.}\label{fig:test_surface}
\end{subfigure}
\caption{Schematic view of the three scenarios. The partitioned source and target domains are shown in blue and orange, respectively.}\label{fig:test_scenarios}
\end{figure}

\subsection{Hardware description}

All the tests presented in this paper were carried out on the same supercomputer. Each of its compute nodes is composed of two \qty{2.4}{\giga\hertz} Intel Xeon~6240R (Cascade Lake) processors, each one containing 24~cores, totalling 48~cores per node. The nodes are interconnected by an Intel OmniPath \qty{100}{\giga\bit\per\second} low-latency network. All experiments were run using one MPI rank per core.

\subsection{Performance analysis}

\subsubsection{Full volume overlap}\label{sec:full_overlap}

In the first test case, the source and target geometries overlap completely. This scenario arises in situations such as solution transfer before restarting a simulation on a different mesh. In this case, the coarse filtering step described in Section~\ref{sec:coarse_filtering} proves useless, since the global source and target AABBs are essentially identical. However, it induces very little overhead since it accounts for less than~\qty{1}{\percent} of the total execution time. This step is therefore omitted in the breakdown given in Figure~\ref{fig:weak_scaling_volume_breakdown}.

Although the source mesh is evenly distributed, the volume of the partition AABBs can vary considerably between processes. In the worst case, a volume ratio of~300 is observed between the largest and smallest AABBs. The number of points to be located by each process in the FVM algorithm is equally unbalanced, as anticipated in Section~\ref{sec:related_work}. Furthermore, Figure~\ref{fig:weak_scaling_volume} shows that the execution time of the FVM algorithm varies significantly between different runs, indicating a high sensitivity to the input partitioning. In contrast, the new algorithm delivers the same execution time regardless of the partitioning, and exhibits greater parallel efficiency. This improvement is explained by the finer preconditioning (Section~\ref{sec:search_candidates}) and the redistribution of the location workload (Section~\ref{sec:exact_location}). A maximum load imbalance of~\qty{10}{\percent} is observed in the rendezvous frame.

Figure~\ref{fig:weak_scaling_volume_exchange} demonstrates the benefits of optimizing the communication graph used to exchange the interpolated data. The strategy described in Section~\ref{sec:return_to_input_frame} proves reasonable, considering the small proportion of execution time devoted to the ``return to input frame'' step, as shown in Figure~\ref{fig:weak_scaling_volume_breakdown}. Figure~\ref{fig:ratio_loc_interp_volume} also confirms that the interpolation step is significantly cheaper than point location (by about two orders of magnitude).

\begin{figure}[h!]
\begin{subfigure}[t][][t]{0.32\textwidth}
\centering
\includegraphics[width=\textwidth]{weak_scaling_qs_n_ov_-1.pdf}
\caption{Point location.}\label{fig:weak_scaling_volume_location}
\end{subfigure}
~
\begin{subfigure}[t][][t]{0.32\textwidth}
\centering
\includegraphics[width=\textwidth]{weak_scaling_qs_n_ov_-1_exchange.pdf}
\caption{Interpolation and exchange.}\label{fig:weak_scaling_volume_exchange}
\end{subfigure}
~
\begin{subfigure}[t][][t]{0.32\textwidth}
\centering
\includegraphics[width=\textwidth]{ratio_loc_interp_n_ov_-1.pdf}
\caption{Ratio point location / interpolation and exchange.}\label{fig:ratio_loc_interp_volume}
\end{subfigure}
\caption{Execution times for the FVM and CWIPI algorithms (full volume overlap). Average times over all runs are represented by solid lines, while shaded areas illustrate the range of elapsed times across all runs.}\label{fig:weak_scaling_volume}
\end{figure}

\begin{figure}[h!]
\includegraphics[width=0.6\textwidth]{weak_scaling_qs_breakdown_n_ov_-1.pdf}
\caption{Breakdown of average execution time for the new point location algorithm (full volume overlap).}\label{fig:weak_scaling_volume_breakdown}
\end{figure}

In applications with dynamic geometries, point location must be performed frequently. Higher frequency generally translates into greater fidelity and numerical robustness. The CPU cost of data mapping should ideally be comparable to that of one iteration of a computational code, which is on the order of~\qty{1}{\micro\second} per cell for state-of-the-art computational fluid dynamics solvers. In comparison, the CPU cost of the new point location algorithm ranges from~\qty{20}{\micro\second} to~\qty{50}{\micro\second} per target point in this test. Dynamic simulations with large meshes thus remain challenging, and optimizations are necessary to overcome the data mapping bottleneck. As indicated in Figure~\ref{fig:weak_scaling_volume_breakdown}, these optimizations should focus primarily on the preconditioning stage.

\FloatBarrier

\subsubsection{Partial volume overlap}\label{sec:partial_overlap}

In the second test case, the target point cloud is shifted so that it overlaps only a thin layer of the source mesh (a few cells thick). This scenario mimics applications such as the multi-component simulation presented in~\cite{arroyo21}, in which data mapping is used to integrate the different moving parts of a full aircraft engine into a single unsteady large-eddy simulation.

Full source and target geometries are provided to the location algorithms, letting them filter out irrelevant points and cells. Figure~\ref{fig:weak_scaling_partial} shows that variations in the input partitioning have a strong impact on the performance of the FVM algorithm, as in the first test case. Besides, most processes are idle during the location step, since the AABB of their source mesh partition intersects no target partition AABB. This load imbalance results in suboptimal parallel efficiency.

\begin{figure}[h!]
\begin{subfigure}[t][][t]{0.32\textwidth}
\centering
\includegraphics[width=\textwidth]{weak_scaling_qs_n_ov_1.pdf}
\caption{Point location.}\label{fig:weak_scaling_partial_location}
\end{subfigure}
\hfill
\begin{subfigure}[t][][t]{0.32\textwidth}
\centering
\includegraphics[width=\textwidth]{weak_scaling_qs_n_ov_1_exchange.pdf}
\caption{Interpolation and exchange.}\label{fig:weak_scaling_partial_exchange}
\end{subfigure}
\hfill
\begin{subfigure}[t][][t]{0.32\textwidth}
\centering
\includegraphics[width=\textwidth]{ratio_loc_interp_n_ov_1.pdf}
\caption{Ratio point location / interpolation and exchange.}\label{fig:ratio_loc_interp_partial}
\end{subfigure}
\caption{Execution times for the FVM and CWIPI algorithms (partial volume overlap). Average times over all runs are represented by solid lines, while shaded areas illustrate the range of elapsed times across all runs.}\label{fig:weak_scaling_partial}
\end{figure}

\begin{figure}[h!]
\includegraphics[width=0.6\textwidth]{weak_scaling_qs_breakdown_n_ov_1.pdf}
\caption{Breakdown of average execution time for the new point location algorithm (partial volume overlap).}\label{fig:weak_scaling_partial_breakdown}
\end{figure}

The new algorithm effectively eliminates up to~\qty{95}{\percent} of the source cells and target points in the first coarse filtering step, once again with minimal extra cost.
Dynamic load balancing proves less effective than in the first test case, since the average workload per process is considerably reduced.
In fact, the SFC-based re-partitioning step accounts for about~\qty{70}{\percent} of the total execution time, as reported in Figure~\ref{fig:weak_scaling_partial_breakdown}.
A more detailed analysis reveals that the bucket sampling algorithm is responsible for~\qty{60}{\percent} of the cost of this step.
This algorithm relies on parameters tuned using heuristics, which could be further optimized for our application. The communication graph used by the FVM algorithm to transfer the interpolated data is much sparser than in the first case. Consequently, both algorithms yield comparable execution times for this step. Still, the new point location algorithm performs better on average and remains much less sensitive to the input data distribution. In this second test, the location step takes around~\qty{10}{\micro\second} per target point\footnote{All the target points are considered, including the ones not contained in any cell.} with the new algorithm, which is still quite expensive compared to an iteration of fluid simulation.

\FloatBarrier

\subsubsection{Surface overlap}\label{sec:surface_overlap}

In the third test case, the target point cloud is shifted so that it overlaps only one boundary face of the source domain. This scenario is representative of surface couplings such as simulation of fluid-structure interaction~\cite{fabbri2023design}. In such applications, data transfer is typically used to apply specific boundary conditions. Only the surface geometric entities and their associated DoFs are therefore considered.

As in the other two test cases, volume meshes are first generated and partitioned. The surface of interest is then extracted without subsequent redistribution, as shown in Figure~\ref{fig:test_meshes_surf}. Consequently, the source cells and target points are unevenly distributed in this input frame, since the surface is contained in only a fraction of the volume partitions. This fraction diminishes as the number of processes increases. Nevertheless, the deterioration in load balance is tempered by a decrease in the average number of cells and points per partition, which scales as $\bigO(P^{-1/3})$. In this situation, communication latency becomes predominant. The new algorithm carries out numerous data movements and is therefore penalized.

\begin{figure}[h!]
\begin{subfigure}[t][][t]{0.32\textwidth}
\centering
\includegraphics[width=\textwidth]{weak_scaling_qs_n_ov_0.pdf}
\caption{Point location.}\label{fig:weak_scaling_surface_location}
\end{subfigure}
\hfill
\begin{subfigure}[t][][t]{0.32\textwidth}
\centering
\includegraphics[width=\textwidth]{weak_scaling_qs_n_ov_0_exchange.pdf}
\caption{Interpolation and exchange.}\label{fig:weak_scaling_surface_exchange}
\end{subfigure}
\hfill
\begin{subfigure}[t][][t]{0.32\textwidth}
\centering
\includegraphics[width=\textwidth]{ratio_loc_interp_n_ov_0.pdf}
\caption{Ratio point location / interpolation and exchange.}\label{fig:ratio_loc_interp_surface}
\end{subfigure}
\caption{Execution times for the old and new algorithms (surface overlap). Average times over all runs are represented by solid lines, while shaded areas illustrate the range of elapsed times across all runs.}\label{fig:weak_scaling_surface}
\end{figure}

A solution to improve parallel efficiency consists in using fewer processes so that the increased average workload compensates for the communication overhead. This strategy is tested by splitting the initial MPI communicator (here COMM\_WORLD), to obtain the sub-communicator COMM\_SURFACE restricted to processes holding a non-empty share of the surface in the input frame. Figures~\ref{fig:weak_scaling_surface} and~\ref{fig:weak_scaling_surface_breakdown} demonstrate the effectiveness of this strategy. In multi-component simulations such as in~\cite{arroyo21}, multiple simultaneous data transfers need to be carried out, in relatively small, distinct geometric regions. Assigning one sub-communicator to each data transfer task would allow for a more effective utilization of computational resources. Yet, this approach could be refined by determining an optimal communicator size based on an assessment of the total workload. Splitting the working communicator after the coarse filtering step could enable even further improvement. However, if data transfer is considered as part of a multi-component simulation, performance remains acceptable (on the order of~\qty{1}{\micro\second} per target point)\footnote{The whole \emph{volume} mesh is considered.} and the proposed optimizations become less significant.

\begin{figure}
\includegraphics[width=0.6\textwidth]{weak_scaling_qs_breakdown_n_ov_0.pdf}
\caption{Breakdown of average execution time for the new point location algorithm (surface overlap), using either COMM\_WORLD or COMM\_SURFACE as the input MPI communicator.}\label{fig:weak_scaling_surface_breakdown}
\end{figure}

\FloatBarrier

\section{Conclusion}

In this paper a novel point location algorithm has been presented, designed for data mapping between distributed meshes and point clouds in a massively parallel environment. Comparisons with a state-of-the-art algorithm are favorable, with a speed-up of up to a factor of~10. This achievement is made possible by a more refined preconditioning strategy combined with dynamic load balancing.

Good scaling is observed for up to \num{1.2}~billion cells and points on \num{4800}~CPU cores, proving that our algorithm can be integrated in a wide range of real-life, large-scale data transfer applications. In order to prepare for the exascale era, the performance analysis still needs to be carried out on larger supercomputers. For instance, the first and last steps of the algorithm (see Sections~\ref{sec:sfc_repartitioning} and~\ref{sec:return_to_input_frame}, respectively) rely heavily on collective (all-to-all) MPI communications, which could become a bottleneck with a higher number of MPI ranks.

Work is also underway to harness the full potential of heterogeneous architectures and accelerate the preconditioning stage through CPU-GPGPU hybridization~\cite{cazalbou24}. The expected speed-up should help overcome the current bottleneck experienced in dynamic simulations. Finally, the possible optimizations proposed in this paper to mitigate the communication overhead will also be explored in future work.

\section*{Acknowledgments}

The manuscript was written through contributions of all authors.

\goodbreak

\section*{Underlying data}\label{sec:supp_data}

The tests presented in Section~\ref{sec:perfo} can be reproduced by running the Python script \texttt{python\_perfo\_location\_octree.py} available in the \texttt{tests/} directory after cloning CWIPI's GitHub repository: \url{https://github.com/onera/cwipi/blob/master/tests/python_perfo_location_octree.py}.

\section*{Declaration of interests}

The authors do not work for, advise, own shares in, or receive funds from any organization that could benefit from this article, and have declared no affiliations other than their research organizations.

\printbibliography

\end{document}