Skip to content

Commit

Permalink
Add support for selecting from different subsets proportionately (#270)
Browse files Browse the repository at this point in the history
* Use numpy to count the unique number of labels for efficiency

* Add support for selecting from different cluster subsets proportionately

* Add tests for the proportional selection

* Fix the problem where zero elements are selected from the minority class

* Add tests for the imbalanced case with multiple classes

* Add testing data for imbalance cases

* Reformat with black

* Add data points to smallest cluster when not enough data points

* Add test for checking that the number of labels matches the total number of data points

* Ignore NotImplementedError in coverage report

* Add typing hints for returns

* Add typing hints
  • Loading branch information
FanwangM authored Oct 6, 2024
1 parent 90acdfe commit 2eb1d7f
Show file tree
Hide file tree
Showing 12 changed files with 575 additions and 91 deletions.
4 changes: 4 additions & 0 deletions .coveragerc
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,7 @@ omit =

[report]
show_missing = True
exclude_also =
pragma: no cover
raise NotImplementedError
if __name__ == .__main__.:
121 changes: 86 additions & 35 deletions selector/methods/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@

import warnings
from abc import ABC, abstractmethod
from typing import List, Iterable, Union

import numpy as np

Expand All @@ -34,7 +35,13 @@
class SelectionBase(ABC):
"""Base class for selecting subset of sample points."""

def select(self, x: np.ndarray, size: int, labels: np.ndarray = None) -> np.ndarray:
def select(
self,
x: np.ndarray,
size: int,
labels: np.ndarray = None,
proportional_selection: bool = True,
) -> Union[List, Iterable]:
"""Return indices representing subset of sample points.
Parameters
Expand All @@ -48,6 +55,10 @@ def select(self, x: np.ndarray, size: int, labels: np.ndarray = None) -> np.ndar
Array of integers or strings representing the labels of the clusters that
each sample belongs to. If `None`, the samples are treated as one cluster.
If labels are provided, selection is made from each cluster.
proportional_selection: bool, optional
If True, the number of samples to be selected from each cluster is proportional to the
number of samples in that cluster. Otherwise, the same number of samples is selected
from each cluster.
Default is True.
Returns
-------
Expand All @@ -70,52 +81,92 @@ def select(self, x: np.ndarray, size: int, labels: np.ndarray = None) -> np.ndar
f"Number of labels {len(labels)} does not match number of samples {len(x)}."
)

selected_ids = []

# compute the number of samples (i.e. population or pop) in each cluster
unique_labels = np.unique(labels)
unique_labels, unique_label_counts = np.unique(labels, return_counts=True)
num_clusters = len(unique_labels)
pop_clusters = {
unique_label: len(np.where(labels == unique_label)[0]) for unique_label in unique_labels
}
pop_clusters = dict(zip(unique_labels, unique_label_counts))
# compute number of samples to be selected from each cluster
n = size // num_clusters

# update number of samples to select from each cluster based on the cluster population.
# this is needed when some clusters do not have enough samples in them (pop < n) and
# needs to be done iteratively until all remaining clusters have at least n samples
selected_ids = []
while np.any([value <= n for value in pop_clusters.values() if value != 0]):
for unique_label in unique_labels:
if pop_clusters[unique_label] != 0:
# get index of sample labelled with unique_label
cluster_ids = np.where(labels == unique_label)[0]
if len(cluster_ids) <= n:
# all samples in the cluster are selected & population becomes zero
selected_ids.append(cluster_ids)
pop_clusters[unique_label] = 0
# update number of samples to be selected from each cluster
totally_used_clusters = list(pop_clusters.values()).count(0)
n = (size - len(np.hstack(selected_ids))) // (num_clusters - totally_used_clusters)

warnings.warn(
f"Number of molecules in one cluster is less than"
f" {size}/{num_clusters}.\nNumber of selected "
f"molecules might be less than desired.\nIn order to avoid this "
f"problem. Try to use less number of clusters"
)

for unique_label in unique_labels:
if proportional_selection:
# make sure that the total number of samples selected is equal to size
size_each_cluster = size * unique_label_counts / len(labels)
# using np.round to get to the nearest integer
# not using int function directly to avoid truncation of decimal values
size_each_cluster = np.round(size_each_cluster).astype(int)
# make sure each cluster has at least one sample
size_each_cluster[size_each_cluster < 1] = 1

# the total number of samples selected from all clusters at this point
size_each_cluster_total = np.sum(size_each_cluster)
# when the total of data points in each class is less than the required number
# add one sample to the smallest cluster iteratively until the total is equal to the
# required number
if size_each_cluster_total < size:
while size_each_cluster_total < size:
# the number of remaining data points in each cluster
size_each_cluster_remaining = unique_label_counts - size_each_cluster_total
# skip the clusters with no data points left
size_each_cluster_remaining[size_each_cluster_remaining == 0] = np.inf
smallest_cluster_index = np.argmin(size_each_cluster_remaining)
size_each_cluster[smallest_cluster_index] += 1
size_each_cluster_total += 1
# when the total of data points in each class is more than the required number
# we need to remove samples from the largest clusters
elif size_each_cluster_total > size:
while size_each_cluster_total > size:
largest_cluster_index = np.argmax(size_each_cluster)
size_each_cluster[largest_cluster_index] -= 1
size_each_cluster_total -= 1
# perfect case where the total is equal to the required number
else:
pass
else:
size_each_cluster = size // num_clusters

# update number of samples to select from each cluster based on the cluster population.
# this is needed when some clusters do not have enough samples in them
# (pop < size_each_cluster) and needs to be done iteratively until all remaining clusters
# have at least size_each_cluster samples
while np.any(
[value <= size_each_cluster for value in pop_clusters.values() if value != 0]
):
for unique_label in unique_labels:
if pop_clusters[unique_label] != 0:
# get index of sample labelled with unique_label
cluster_ids = np.where(labels == unique_label)[0]
if len(cluster_ids) <= size_each_cluster:
# all samples in the cluster are selected & population becomes zero
selected_ids.append(cluster_ids)
pop_clusters[unique_label] = 0
# update number of samples to be selected from each cluster
totally_used_clusters = list(pop_clusters.values()).count(0)
size_each_cluster = (size - len(np.hstack(selected_ids))) // (
num_clusters - totally_used_clusters
)

warnings.warn(
f"Number of molecules in one cluster is less than"
f" {size}/{num_clusters}.\nNumber of selected "
f"molecules might be less than desired.\nIn order to avoid this "
f"problem. Try to use less number of clusters."
)
# save the number of samples to be selected from each cluster in an array
size_each_cluster = np.full(num_clusters, size_each_cluster)

for unique_label, size_sub in zip(unique_labels, size_each_cluster):
if pop_clusters[unique_label] != 0:
# sample n ids from cluster labeled unique_label
# sample size_each_cluster ids from cluster labeled unique_label
cluster_ids = np.where(labels == unique_label)[0]
selected = self.select_from_cluster(x, n, cluster_ids)
selected = self.select_from_cluster(x, size_sub, cluster_ids)
selected_ids.append(cluster_ids[selected])

return np.hstack(selected_ids).flatten().tolist()

@abstractmethod
def select_from_cluster(
self, x: np.ndarray, size: int, labels: np.ndarray = None
) -> np.ndarray:
) -> np.ndarray: # pragma: no cover
"""Return indices representing subset of sample points from one cluster.
Parameters
Expand Down
38 changes: 21 additions & 17 deletions selector/methods/distance.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
import bitarray
import numpy as np
from scipy import spatial
from typing import List, Iterable, Union

from selector.methods.base import SelectionBase
from selector.methods.utils import optimize_radius
Expand Down Expand Up @@ -86,7 +87,7 @@ def __init__(self, fun_dist=None, ref_index=None):
self.fun_dist = fun_dist
self.ref_index = ref_index

def select_from_cluster(self, x, size, labels=None):
def select_from_cluster(self, x, size, labels=None) -> Union[List, Iterable]:
"""Return selected samples from a cluster based on MaxMin algorithm.
Parameters
Expand All @@ -102,7 +103,7 @@ def select_from_cluster(self, x, size, labels=None):
Returns
-------
selected : list
selected : Union[List, Iterable]
List of indices of selected samples.
"""
# calculate pairwise distance between points
Expand Down Expand Up @@ -134,6 +135,8 @@ def select_from_cluster(self, x, size, labels=None):
new_id = np.argmax(min_distances)
selected.append(new_id)

selected = [int(i) for i in selected]

return selected


Expand Down Expand Up @@ -184,7 +187,7 @@ def __init__(self, fun_dist=None, ref_index=None):
self.fun_dist = fun_dist
self.ref_index = ref_index

def select_from_cluster(self, x, size, labels=None):
def select_from_cluster(self, x, size, labels=None) -> Union[List, Iterable]:
"""Return selected samples from a cluster based on MaxSum algorithm.
Parameters
Expand All @@ -200,7 +203,7 @@ def select_from_cluster(self, x, size, labels=None):
Returns
-------
selected : list
selected : Union[List, Iterable]
List of indices of selected samples.
"""
Expand Down Expand Up @@ -237,6 +240,8 @@ def select_from_cluster(self, x, size, labels=None):
# already-selected points
new_id = np.argmax(sum_distances)
selected.append(new_id)

selected = [int(i) for i in selected]
return selected


Expand All @@ -261,6 +266,7 @@ class 0 and `ref_index=[3, 6]` class 1 respectively.
References
----------
[1] J. Chem. Inf. Comput. Sci. 1997, 37, 6, 1181–1188. https://doi.org/10.1021/ci970282v
"""

def __init__(
Expand Down Expand Up @@ -330,7 +336,7 @@ def __init__(
self.random_seed = random_seed
self.fun_dist = fun_dist

def algorithm(self, x, max_size) -> list:
def algorithm(self, x, max_size) -> Union[List, Iterable]:
"""Return selected sample indices based on OptiSim algorithm.
Parameters
Expand All @@ -342,7 +348,7 @@ def algorithm(self, x, max_size) -> list:
Returns
-------
selected : list
selected : Union[List, Iterable]
List of indices of selected sample indices.
"""
Expand Down Expand Up @@ -402,7 +408,7 @@ def algorithm(self, x, max_size) -> list:

return selected

def select_from_cluster(self, x, size, labels=None):
def select_from_cluster(self, x, size, labels=None) -> Union[List, Iterable]:
"""Return selected samples from a cluster based on OptiSim algorithm.
Parameters
Expand All @@ -416,7 +422,7 @@ def select_from_cluster(self, x, size, labels=None):
Returns
-------
selected : list
selected : Union[List, Iterable]
List of indices of selected samples.
"""
Expand Down Expand Up @@ -459,9 +465,7 @@ class DISE(SelectionBase):
"""

def __init__(
self, r0=None, ref_index=None, tol=0.05, n_iter=10, p=2.0, eps=0.0, fun_dist=None
):
def __init__(self, r0=None, ref_index=None, tol=0.05, n_iter=10, p=2.0, eps=0.0, fun_dist=None):
"""
Initialize class.
Expand Down Expand Up @@ -511,7 +515,7 @@ def __init__(
# self.fun_dist = fun_dist
self.fun_dist = fun_dist

def algorithm(self, x, max_size):
def algorithm(self, x, max_size) -> Union[List, Iterable]:
"""Return selected samples based on directed sphere exclusion algorithm.
Parameters
Expand All @@ -523,7 +527,7 @@ def algorithm(self, x, max_size):
Returns
-------
selected: list
selected: Union[List, Iterable]
List of indices of selected samples.
"""
Expand Down Expand Up @@ -593,7 +597,7 @@ def algorithm(self, x, max_size):

return selected

def select_from_cluster(self, x, size, labels=None):
def select_from_cluster(self, x, size, labels=None) -> Union[List, Iterable]:
"""Return selected samples from a cluster based on directed sphere exclusion algorithm
Parameters
Expand All @@ -607,7 +611,7 @@ def select_from_cluster(self, x, size, labels=None):
Returns
-------
selected: list
selected: Union[List, Iterable]
List of indices of selected samples.
"""
Expand All @@ -625,7 +629,7 @@ def select_from_cluster(self, x, size, labels=None):
return optimize_radius(self, x, size, labels)


def get_initial_selection(x=None, x_dist=None, ref_index=None, fun_dist=None):
def get_initial_selection(x=None, x_dist=None, ref_index=None, fun_dist=None) -> List:
"""Set up the reference index for selecting.
Parameters
Expand All @@ -650,7 +654,7 @@ def get_initial_selection(x=None, x_dist=None, ref_index=None, fun_dist=None):
Returns
-------
initial_selections: list
initial_selections: List
List of indices of the initial selected data points.
"""
Expand Down
1 change: 0 additions & 1 deletion selector/methods/partition.py
Original file line number Diff line number Diff line change
Expand Up @@ -666,4 +666,3 @@ def select_from_cluster(self, arr, num_selected, cluster_ids=None):
)
count += 1
return selected

Loading

0 comments on commit 2eb1d7f

Please sign in to comment.