Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
4a70b22
initial progress
SamuelNeceda Oct 21, 2024
ccd7051
unit tests added
SamuelNeceda Oct 30, 2024
5d14afe
refactor to map
SamuelNeceda Oct 30, 2024
90e99e7
Update RandomServiceTest.java
SamuelNeceda Oct 30, 2024
a76280a
Merge branch 'main' into weighted_selection
SamuelNeceda Oct 30, 2024
2525cad
enhance handling for edge cases
SamuelNeceda Oct 31, 2024
10856e6
Merge branch 'main' into weighted_selection
SamuelNeceda Oct 31, 2024
8d0de4a
hardcoded custom data provider
SamuelNeceda Oct 31, 2024
754af80
increase tests coverage
SamuelNeceda Oct 31, 2024
8499822
reference test constants directly in the test class
SamuelNeceda Nov 1, 2024
ea4c662
check for duplicates & overflow prevention
SamuelNeceda Nov 1, 2024
0a0c726
improve efficiency
SamuelNeceda Nov 1, 2024
012b483
Merge branch 'main' into weighted_selection
SamuelNeceda Nov 1, 2024
4ea7223
update java doc
SamuelNeceda Nov 1, 2024
0d3b828
separate logic to WeightedRandomSelector
SamuelNeceda Nov 1, 2024
49efe88
register provider & add documentation
SamuelNeceda Nov 3, 2024
353a4a5
Merge branch 'main' into weighted_selection
SamuelNeceda Nov 3, 2024
6281667
fix test
SamuelNeceda Nov 9, 2024
f3e210d
Merge branch 'main' into weighted_selection
SamuelNeceda Nov 9, 2024
19ecfd6
increase tests coverage for WeightedRandomSelector
SamuelNeceda Nov 9, 2024
410bfff
fix
SamuelNeceda Nov 9, 2024
1074120
Merge branch 'main' into weighted_selection
SamuelNeceda Nov 17, 2024
5f7f43c
remove implementation from BaseFaker.java, ProviderRegistration.java,…
SamuelNeceda Nov 17, 2024
c65938d
pr review changes
SamuelNeceda Nov 17, 2024
60299a4
update doc
SamuelNeceda Nov 17, 2024
9a06e2d
Update FakeValuesService.java
SamuelNeceda Nov 17, 2024
a70472b
Update UniqueTest.java
SamuelNeceda Nov 17, 2024
1b04d0a
Merge branch 'main' into weighted_selection
SamuelNeceda Nov 18, 2024
1be2de5
Merge branch 'main' into weighted_selection
SamuelNeceda Nov 24, 2024
8112e46
Update pom_update.yml
SamuelNeceda Nov 24, 2024
0a92ca5
Merge branch 'weighted_selection' of https://github.com/SamuelNeceda/…
SamuelNeceda Nov 24, 2024
3f58c88
pr review
SamuelNeceda Nov 24, 2024
2712e8b
tests refactored
SamuelNeceda Nov 24, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 30 additions & 4 deletions docs/documentation/custom-providers.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,15 @@ Create a custom provider of data:
=== "Java"

``` java
public class Insect extends AbstractProvider<BaseProviders> {
private static final String[] INSECT_NAMES = new String[]{"Ant", "Beetle", "Butterfly", "Wasp"};
public static class Insect extends AbstractProvider<BaseProviders> {
private static final WeightedRandomSelector selector = new WeightedRandomSelector(new Random());

private static final String[] INSECT_NAMES = { "Ant", "Beetle", "Butterfly", "Wasp" };
private static final List<Map<String, Object>> WEIGHTED_INSECTS = List.of(
Map.of("value", "Driver ant", "weight", 6.0),
Map.of("value", "Fire ant", "weight", 3.0),
Map.of("value", "Harvester ant", "weight", 1.0)
);

public Insect(BaseProviders faker) {
super(faker);
Expand All @@ -31,10 +38,13 @@ Create a custom provider of data:
public String nextInsectName() {
return INSECT_NAMES[faker.random().nextInt(INSECT_NAMES.length)];
}

public String weightedInsectName() {
return selector.select(WEIGHTED_INSECTS);
}
}
```



### Register provider

Create your own custom faker, which extends `Faker`, and register the custom provider:
Expand Down Expand Up @@ -66,6 +76,22 @@ This will print something like the following:
Wasp
```

**Usage of the weighted random selector is in the POC stage and is currently available only for custom hardcoded providers.**

To use a random selector based on weights, you can do the following:

=== "Java"

``` java
MyCustomFaker myFaker = new MyCustomFaker();
System.out.println(myFaker.insect().weightedInsectName());
```

This will print a random insect name, chosen according to the provided weights, for example:
```
Driver ant
```

## Custom provider using Yaml file

In case you have a large set of data to load, it might be better to use a Yaml file.
Expand Down
136 changes: 136 additions & 0 deletions src/main/java/net/datafaker/service/WeightedRandomSelector.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
package net.datafaker.service;

import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;

/**
* A utility class for selecting a random element from a list based on assigned weights.
**/
public record WeightedRandomSelector(Random random) {
private static final String WEIGHT_KEY = "weight";
private static final String VALUE_KEY = "value";

/**
 * Creates a selector backed by the given source of randomness.
 *
 * @param random the generator used when drawing a selection; when {@code null},
 *               a freshly created {@link Random} is used instead.
 */
public WeightedRandomSelector(Random random) {
    if (random == null) {
        random = new Random();
    }
    this.random = random;
}

/**
* Returns a weighted random element from the given list, where each element is represented as a Map
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Am I right that the only way to change weight is change file and rebuild everything?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The weights are provided as part of the input items list and are processed at runtime. Changing weights does not require rebuilding the application but simply involves modifying the input list or the file (if applicable) and rerunning the program.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this I didn't get...
We have files with data inside resources which are part of the jar right?
Also there might be custom data which could be either in resources or in separate files.

With normal files it is clear how to change weights without rebuilds
How can we do this with any from resources folder?

* containing a weight and the corresponding value.
* <p>
*
* @param items A list of maps, where each map contains:
* - weight: A Double representing the weight of the element, influencing its selection probability.
* - value: The actual element of type T to be randomly selected based on its weight.
* @param <T> The type of the element to be selected from the list. The value associated with the weight can be of any type.
* @return A randomly selected element based on its weight.
* @throws IllegalArgumentException if:
* - the list is null or empty,
* - any item in the list is null or empty,
* - the item does not contain 'weight' or 'value' keys,
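* - any weight is null, not a Double, negative, NaN or infinite,
* - the total of all weights is not greater than 0 (at least one weight must be positive),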
* - any values in the list are not unique or null,
* - the sum of weights exceeds Double.MAX_VALUE.
*/
public <T> T select(List<Map<String, Object>> items) {
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

looks like most of the methods here do not work with object state => could be turned to static

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Relevant methods were turned to static and private

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

how about this method

<T> T select(List<Map<String, Object>> items) {

why can not it be switched to static?

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
public <T> T select(List<Map<String, Object>> items) {
public static <T> T select(List<Map<String, Object>> items) {

this seems not addressed yet
not sure if we need it public

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

By the current design, the method public T select(List<Map<String, Object>> items) cannot be static because it relies on the random field of the WeightedRandomSelector record, which is an instance field.
If the goal is to make the method static, I will need to explicitly pass the Random object to the method.
From my point of view, the current implementation is correct.

validateItemsList(items);

Object[] values = new Object[items.size()];
double[] cumulativeWeights = preprocessItems(items, values);

double randomValue = random.nextDouble() * cumulativeWeights[cumulativeWeights.length - 1];
return selectWeightedElement(randomValue, cumulativeWeights, values);
}

/**
 * Checks the whole input list before any selection work is done.
 * <p>
 * The list itself must be non-null and non-empty; every entry is then validated
 * individually and its value is checked against the values seen so far.
 *
 * @param items the candidate entries, each a map holding a weight and a value.
 * @throws IllegalArgumentException if the list is null or empty, or if any entry
 *                                  fails item validation or repeats an earlier value.
 */
private static void validateItemsList(List<Map<String, Object>> items) {
    if (items == null) {
        throw new IllegalArgumentException("Input list cannot be null");
    } else if (items.isEmpty()) {
        throw new IllegalArgumentException("Input list cannot be empty");
    }

    // Values already encountered; shared across entries to detect duplicates.
    final Set<Object> seenValues = new HashSet<>();
    items.forEach(entry -> {
        validateItem(entry);
        assertUniqueValues(entry, seenValues);
    });
}

private static void assertUniqueValues(Map<String, Object> item, Set<Object> values) {
Object value = item.get(VALUE_KEY);
Copy link
Copy Markdown
Collaborator

@snuyanzin snuyanzin Nov 9, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It doesn't guarantee uniqueness
for instance there could be MyTest.yml

...
   name:
    - name1
   ref1:
     - #{MyTest.name}
    ref2:
      - #{MyTest.name}
    dataForWeighted:
      - value: #{MyTest.name}
         weight: 2.0
      - value: #{MyTest.ref1}
         weight: 3.0
      - value: #{MyTest.ref2}
         weight: 5.0

...

in fact values for dataForWeighted will be same however assertUniqueValues will say that it is unique...

if (!values.add(value)) {
throw new IllegalArgumentException("Duplicate value found: " + value + ". Values must be unique.");
}
}

private static void validateItem(Map<String, Object> item) {
if (item == null) {
throw new IllegalArgumentException("Item cannot be null");
}
if (item.isEmpty()) {
throw new IllegalArgumentException("Item cannot be empty");
}
if (!item.containsKey(WEIGHT_KEY) || !item.containsKey(VALUE_KEY)) {
Copy link
Copy Markdown
Collaborator

@snuyanzin snuyanzin Nov 9, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

are we checking separately existing of keys and non null values only for the sake of different error messages?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I split the condition into separate checks with distinct error messages.

throw new IllegalArgumentException("Each item must contain 'weight' and 'value' keys");
}
validateValue(item.get(VALUE_KEY));
validateWeight(item.get(WEIGHT_KEY));
}

/**
 * Rejects a missing value.
 *
 * @param valueObj the value stored in an item.
 * @throws IllegalArgumentException if {@code valueObj} is {@code null}.
 */
private static void validateValue(Object valueObj) {
    if (valueObj != null) {
        return;
    }
    throw new IllegalArgumentException("Value cannot be null");
}

/**
 * Checks that a weight is usable for selection.
 * <p>
 * Zero is accepted here; only negative, NaN and infinite weights are rejected.
 *
 * @param weightObj the weight stored in an item; must be a {@link Double}.
 * @throws IllegalArgumentException if {@code weightObj} is {@code null} or not a
 *                                  {@code Double}, or if it is negative, NaN or infinite.
 */
private static void validateWeight(Object weightObj) {
    if (weightObj instanceof Double weight) {
        // isFinite is false for NaN and for both infinities.
        if (Double.isFinite(weight) && weight >= 0) {
            return;
        }
        throw new IllegalArgumentException("Weight must be a non-negative number and cannot be NaN or infinite");
    }
    throw new IllegalArgumentException("Weight must be a non-null Double");
}

/**
 * Ensures the accumulated weight leaves at least one element selectable.
 *
 * @param totalWeight the sum of all item weights.
 * @throws IllegalArgumentException if {@code totalWeight} is zero or negative.
 */
private static void validateTotalWeight(double totalWeight) {
    final boolean nonPositive = totalWeight <= 0;
    if (!nonPositive) {
        return;
    }
    throw new IllegalArgumentException("The total weight must be greater than 0. At least one item must have a positive weight");
}

/**
 * Builds the cumulative weight table for the given items and copies their values
 * into {@code values} at matching indices.
 *
 * @param items  the entries to process; each weight is read as a {@code Double}.
 * @param values output array that receives the value of each entry, in list order.
 * @return an array whose element {@code i} is the sum of the weights of entries {@code 0..i}.
 * @throws IllegalArgumentException if adding a weight would push the running sum past
 *                                  {@code Double.MAX_VALUE}, or if the final sum is not positive.
 */
static double[] preprocessItems(List<Map<String, Object>> items, Object[] values) {
    final double[] cumulativeWeights = new double[items.size()];

    double runningTotal = 0.0;
    for (int idx = 0; idx < items.size(); idx++) {
        final Map<String, Object> entry = items.get(idx);
        final double weight = (Double) entry.get(WEIGHT_KEY);

        // Compare against the remaining headroom instead of adding first, so the sum never overflows.
        final double headroom = Double.MAX_VALUE - runningTotal;
        if (weight > headroom) {
            throw new IllegalArgumentException("Sum of the weights exceeds Double.MAX_VALUE");
        }

        runningTotal += weight;
        cumulativeWeights[idx] = runningTotal;
        values[idx] = entry.get(VALUE_KEY);
    }

    validateTotalWeight(runningTotal);

    return cumulativeWeights;
}

static <T> T selectWeightedElement(double randomValue, double[] cumulativeWeights, Object[] values) {
int index = Arrays.binarySearch(cumulativeWeights, randomValue);
index = (index < 0) ? -index - 1 : index;

if (index >= cumulativeWeights.length) {
index = cumulativeWeights.length - 1;
}
Comment on lines +130 to +132
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
if (index >= cumulativeWeights.length) {
index = cumulativeWeights.length - 1;
}
index = Math.max(index, cumulativeWeights.length - 1);

i think it would be shorter

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Proposed change will always set index to cumulativeWeights.length - 1 if index is less than cumulativeWeights.length - 1, which is not the intended behavior. The intent is to clamp the value only when index is greater than or equal to cumulativeWeights.length


return (T) values[index];
}
}
45 changes: 37 additions & 8 deletions src/test/java/net/datafaker/providers/base/CustomFakerTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,18 @@
import org.junit.jupiter.api.Test;

import java.nio.file.Paths;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Random;

import net.datafaker.service.WeightedRandomSelector;
import static org.assertj.core.api.Assertions.assertThat;
import static org.assertj.core.api.Assertions.assertThatThrownBy;

/**
* This is a demo of how to create a custom faker and register a custom faker in it.
* This is a demo of how to create a custom data provider and register a custom faker to use it.
*/
class CustomFakerTest {
public static class MyCustomFaker extends BaseFaker {
Expand All @@ -24,7 +29,14 @@ public InsectFromFile insectFromFile() {
}

public static class Insect extends AbstractProvider<BaseProviders> {
private static final String[] INSECT_NAMES = {"Ant", "Beetle", "Butterfly", "Wasp"};
private static final WeightedRandomSelector selector = new WeightedRandomSelector(new Random());

private static final String[] INSECT_NAMES = { "Ant", "Beetle", "Butterfly", "Wasp" };
private static final List<Map<String, Object>> WEIGHTED_INSECTS = List.of(
Map.of("value", "Driver ant", "weight", 6.0),
Map.of("value", "Fire ant", "weight", 3.0),
Map.of("value", "Harvester ant", "weight", 1.0)
);

public Insect(BaseProviders faker) {
super(faker);
Expand All @@ -33,6 +45,10 @@ public Insect(BaseProviders faker) {
public String nextInsectName() {
return INSECT_NAMES[faker.random().nextInt(INSECT_NAMES.length)];
}

public String weightedInsectName() {
return selector.select(WEIGHTED_INSECTS);
}
}

public static class InsectFromFile extends AbstractProvider<BaseProviders> {
Expand Down Expand Up @@ -107,18 +123,31 @@ class InsectFaker extends BaseFaker {
public InsectFaker(Locale locale) {
super(locale);
}

public Insect insect() {
return getProvider(Insect.class, Insect::new);
}
}
BaseFaker faker1 = new InsectFaker(Locale.ENGLISH);
BaseFaker faker2 = new InsectFaker(Locale.GERMAN);

Insect insect1 = faker1.getProvider("Insect");
Insect insect2 = faker2.getProvider("Insect");
Insect insect1 = faker1.getProvider(Insect.class, Insect::new);
Insect insect2 = faker2.getProvider(Insect.class, Insect::new);
assertThat(insect1).isNotNull();
assertThat(insect2).isNotNull();
assertThat(insect1).isNotSameAs(insect2);
}

/**
 * Verifies that selection frequency follows the configured weights
 * (Driver ant 6.0, Fire ant 3.0, Harvester ant 1.0).
 * <p>
 * The selector draws from an unseeded {@link Random}, so the assertion is statistical.
 * The sample is kept large so that an inverted ordering between two neighbouring
 * weights is practically impossible; a sample of only 100 draws fails on rare runs.
 */
@Test
void weightedInsectNameTest() {
    final int draws = 10_000;

    MyCustomFaker myFaker = new MyCustomFaker();
    Map<String, Integer> insectCounts = new HashMap<>();
    // Pre-seed every expected name so the assertions never read a missing key.
    insectCounts.put("Driver ant", 0);
    insectCounts.put("Fire ant", 0);
    insectCounts.put("Harvester ant", 0);

    for (int i = 0; i < draws; i++) {
        insectCounts.merge(myFaker.insect().weightedInsectName(), 1, Integer::sum);
    }

    assertThat(insectCounts.get("Driver ant")).isGreaterThan(insectCounts.get("Fire ant"));
    assertThat(insectCounts.get("Fire ant")).isGreaterThan(insectCounts.get("Harvester ant"));
}
}
Loading