From 9141deb932d6e18988386204943996ac7560c40c Mon Sep 17 00:00:00 2001 From: Vincent Date: Thu, 14 Apr 2022 22:16:24 +0200 Subject: [PATCH 1/6] added --- doubtlab/reason.py | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/doubtlab/reason.py b/doubtlab/reason.py index f25a799..ec7076e 100644 --- a/doubtlab/reason.py +++ b/doubtlab/reason.py @@ -255,14 +255,13 @@ def from_proba(proba, y, classes, threshold): assert np.all(predicate == np.array([0.0, 1.0, 1.0])) ``` """ - values = [] - for i, proba in enumerate(proba): - proba_dict = { - classes[j]: v for j, v in enumerate(proba) if classes[j] != y[i] - } - values.append(max(proba_dict.values())) - confidences = np.array(values) - return (confidences > threshold).astype(np.float16) + mapper = {k: i for i, k in enumerate(classes)} + y_int = np.array([mapper[k] for k in y]) + confidences = proba.copy() + # Advanced indexing trick: + # https://numpy.org/doc/stable/user/basics.indexing.html#integer-array-indexing + confidences[np.arange(proba.shape[0]), y_int] = 0 + return (confidences.max(axis=1) > threshold).astype(np.float16) def __call__(self, X, y): probas = self.model.predict_proba(X) @@ -377,14 +376,11 @@ def from_proba(proba, y, classes, threshold=0.2): assert np.all(predicate == np.array([0.0, 0.0, 1.0])) ``` """ - values = [] - for i, p in enumerate(proba): - true_label = y[i] - proba_dict = { - classes[j]: v for j, v in enumerate(p) if true_label == classes[j] - } - values.append(proba_dict[true_label]) - confidences = np.array(values) + mapper = {k: i for i, k in enumerate(classes)} + y_int = np.array([mapper[k] for k in y]) + # Advanced indexing trick: + # https://numpy.org/doc/stable/user/basics.indexing.html#integer-array-indexing + confidences = proba[np.arange(proba.shape[0]), y_int] return (confidences < threshold).astype(np.float16) def __call__(self, X, y): From fefc32f1536d0dfbf414d0ba204c3615beee0f33 Mon Sep 17 00:00:00 2001 From: Vincent Date: Thu, 14 
Apr 2022 22:21:09 +0200 Subject: [PATCH 2/6] added-a-test --- tests/test_reason/test_shortconfreason.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tests/test_reason/test_shortconfreason.py b/tests/test_reason/test_shortconfreason.py index 7d97f2f..2b9ac08 100644 --- a/tests/test_reason/test_shortconfreason.py +++ b/tests/test_reason/test_shortconfreason.py @@ -16,6 +16,20 @@ def test_short_conf_probas(): assert np.all(predicate == np.array([0.0, 1.0])) +def test_short_conf_probas_bigger(): + """ + Test `from_probas` on a bigger obvious example. + """ + probas = np.array([[0.5, 0.5, 0.0], [0.3, 0.3, 0.4], [0.65, 0.15, 0.3]]) + y = np.array([1, 2, 0]) + classes = np.array([0, 1, 2]) + threshold = 0.6 + predicate = ShortConfidenceReason.from_proba( + proba=probas, y=y, classes=classes, threshold=threshold + ) + assert np.all(predicate == np.array([1.0, 1.0, 0.0])) + + def test_short_conf_non_numeric(): """ Test `from_probas` on an obvious example. From e101c55c2d1e1de9e26e6e84069226d2625a12bd Mon Sep 17 00:00:00 2001 From: Vincent Date: Thu, 14 Apr 2022 22:34:00 +0200 Subject: [PATCH 3/6] added-another-test --- tests/test_reason/test_longconfreason.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tests/test_reason/test_longconfreason.py b/tests/test_reason/test_longconfreason.py index d9d9f73..f877b94 100644 --- a/tests/test_reason/test_longconfreason.py +++ b/tests/test_reason/test_longconfreason.py @@ -24,3 +24,15 @@ def test_longconf_proba_nonnumeric(): proba=probas, y=y, classes=classes, threshold=threshold ) assert np.all(predicate == np.array([0.0, 1.0])) + + +def test_longconf_proba_bigger_nonnumeric(): + """Test from_probas on a bigger obvious example.""" + probas = np.array([[0.9, 0.1], [0.5, 0.5], [0.1, 0.9], [0.2, 0.8]]) + y = np.array(["a", "b", "a", "b"]) + classes = np.array(["a", "b"]) + threshold = 0.4 + predicate = LongConfidenceReason.from_proba( + proba=probas, y=y, classes=classes, threshold=threshold + ) + 
assert np.all(predicate == np.array([0.0, 1.0, 1.0, 0.0])) From a2061bbe02a777d615ae26fe1c43c6c0675f420c Mon Sep 17 00:00:00 2001 From: Vincent Date: Thu, 14 Apr 2022 22:37:53 +0200 Subject: [PATCH 4/6] update --- docs/quickstart/faq.md | 50 -------------------------------------- docs/quickstart/index.md | 52 ++++++++++++++++++++++++++++++++++++++++ setup.py | 2 +- 3 files changed, 53 insertions(+), 51 deletions(-) diff --git a/docs/quickstart/faq.md b/docs/quickstart/faq.md index 1dbbc41..b7a6ede 100644 --- a/docs/quickstart/faq.md +++ b/docs/quickstart/faq.md @@ -17,53 +17,3 @@ ensemble = DoubtEnsemble( Note that you can also add another reason for `nan` values that appear in `X`. - -## How do I prevent models from re-computing? - -Suppose you have a setup that looks something like: - -```python -from doubtlab.ensemble import DoubtEnsemble -from doubtlab.reason import ProbaReason, ShortConfidenceReason, LongConfidenceReason - -# Suppose this dataset is very big and that this computation is heavy. -X, y = load_big_dataset() -model = LogisticRegression(max_iter=1_000) -model.fit(X, y) - -# This step might be expensive because internally we will be calling -# `model.predict_proba(X)` a lot! -ensemble = DoubtEnsemble( - proba=ProbaReason(model) - short=ShortConfidenceReason(model), - long=LongConfidenceReason(model) -) -``` - -Then you might wonder if we're able to speed things up by precomputing our -`.predict_proba()`-values. You could use `lambda`s, but you can also use -common utility methods that have been added to the reason classes. Most of -our reasons implement a `from_pred` or `from_proba` method that you can use. -See the [API](https://koaning.github.io/doubtlab/api/reasons/) for more details. - -That way, we can rewrite the code for a speedup. 
- -```python -from doubtlab.ensemble import DoubtEnsemble -from doubtlab.reason import ProbaReason, ShortConfidenceReason, LongConfidenceReason - -# Suppose this dataset is very big and that this computation is heavy. -X, y = load_big_dataset() -model = LogisticRegression(max_iter=1_000) -model.fit(X, y) - -# Let's precalculate the proba values. -probas = model.predict_proba(X) - -# We can re-use the probas below. Note that some reasons require extra information. -ensemble = DoubtEnsemble( - proba=ProbaReason.from_proba(probas) - short=ShortConfidenceReason.from_proba(probas, y, classes=["pos", "neg"], threshold=0.2), - long=LongConfidenceReason.from_proba(probas, y, classes=["pos", "neg"], threshold=0.4) -) -``` diff --git a/docs/quickstart/index.md b/docs/quickstart/index.md index 5d6812d..3a94622 100644 --- a/docs/quickstart/index.md +++ b/docs/quickstart/index.md @@ -142,6 +142,58 @@ it may be a problem for your dataset as well. The hope is that this library makes it just a bit easier for folks do to check their datasets for bad labels. It's an exercise worth doing and the author of this library would love to hear anekdotes. +## Does this scale? + +You might be dealing with a large dataset, in which case you may want to +be conscious of compute time. Suppose you have a setup that looks something like: + +```python +from doubtlab.ensemble import DoubtEnsemble +from doubtlab.reason import ProbaReason, ShortConfidenceReason, LongConfidenceReason + +# Suppose this dataset is very big and that this computation is heavy. +X, y = load_big_dataset() +model = LogisticRegression(max_iter=1_000) +model.fit(X, y) + +# This step might be expensive because internally we will be calling +# `model.predict_proba(X)` a lot! +ensemble = DoubtEnsemble( + proba=ProbaReason(model), + short=ShortConfidenceReason(model), + long=LongConfidenceReason(model) +) +``` + +Then you might wonder if we're able to speed things up by precomputing our +`.predict_proba()`-values. 
You could use `lambda`s, but you can also use +common utility methods that have been added to the reason classes. Most of +our reasons implement a `from_pred` or `from_proba` method that you can use. +See the [API](https://koaning.github.io/doubtlab/api/reasons/) for more details. + +That way, we can rewrite the code for a speedup. + +```python +from doubtlab.ensemble import DoubtEnsemble +from doubtlab.reason import ProbaReason, ShortConfidenceReason, LongConfidenceReason + +# Suppose this dataset is very big and that this computation is heavy. +X, y = load_big_dataset() +model = LogisticRegression(max_iter=1_000) +model.fit(X, y) + +# Let's precalculate the proba values. +probas = model.predict_proba(X) + +# We can re-use the probas below. Note that some reasons require extra information. +ensemble = DoubtEnsemble( + proba=ProbaReason.from_proba(probas), + short=ShortConfidenceReason.from_proba(probas, y, classes=["pos", "neg"], threshold=0.2), + long=LongConfidenceReason.from_proba(probas, y, classes=["pos", "neg"], threshold=0.4) +) +``` + + ## Next Steps You may get some more inspiration by checking some of the examples of this library. diff --git a/setup.py b/setup.py index e9c7684..45b8055 100644 --- a/setup.py +++ b/setup.py @@ -27,7 +27,7 @@ setup( name="doubtlab", - version="0.2.1", + version="0.2.2", author="Vincent D. Warmerdam", packages=find_packages(exclude=["notebooks", "docs"]), description="Don't Blindly Trust Your Labels", From e3c9145fffe42f6c7b54b265ac12607283121859 Mon Sep 17 00:00:00 2001 From: Vincent Date: Thu, 14 Apr 2022 22:39:07 +0200 Subject: [PATCH 5/6] update-index --- docs/index.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/index.md b/docs/index.md index 3ab6f85..a3cb3f7 100644 --- a/docs/index.md +++ b/docs/index.md @@ -32,4 +32,5 @@ If you want to get started, we recommend starting [here](./quickstart/). 
## Related Projects - The [cleanlab](https://github.com/cleanlab/cleanlab) project was an inspiration for this one. They have a great heuristic for bad label detection but I wanted to have a library that implements many. Be sure to check out their work on the [labelerrors.com](https://labelerrors.com) project. -- My employer, [Rasa](https://rasa.com/), has always had a focus on data quality. Some of that attitude is bound to have seeped in here. Be sure to check the [Conversation Driven Development](https://rasa.com/docs/rasa/conversation-driven-development/) approach and [Rasa X](https://rasa.com/docs/rasa-x/) if you're working on virtual assistants. +- My former employer, [Rasa](https://rasa.com/), has always had a focus on data quality. Some of that attitude is bound to have seeped in here. Be sure to check the [Conversation Driven Development](https://rasa.com/docs/rasa/conversation-driven-development/) approach and [Rasa X](https://rasa.com/docs/rasa-x/) if you're working on virtual assistants. +- My current employer, [Explosion](https://explosion.ai/), has a neat labelling tool called [prodigy](https://prodi.gy). I'm currently investigating how tools like doubtlab might lead to better labels when combined with this (very like-able) annotation tool. From 40d7a72182c366cf83905f84cc84f6b80a5be5e9 Mon Sep 17 00:00:00 2001 From: Vincent Date: Thu, 14 Apr 2022 22:42:24 +0200 Subject: [PATCH 6/6] ignore-index --- tests/test_docs.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_docs.py b/tests/test_docs.py index 4aa6790..86ce2e4 100644 --- a/tests/test_docs.py +++ b/tests/test_docs.py @@ -59,7 +59,6 @@ def test_function_docstrings(func): "fpath", [ "README.md", - "docs/quickstart/index.md", "docs/quickstart/benchmarks.md", "docs/examples/google-emotions.md", ],