From a0f709b26a3a026abb48b21bdf5fbf547120757f Mon Sep 17 00:00:00 2001 From: Umberto Lupo <46537483+ulupo@users.noreply.github.com> Date: Tue, 2 Jun 2020 08:39:59 +0200 Subject: [PATCH] Create giotto-tda v0.2.2 (#414) * Fix issue with docstring example and document reshaping of 1D outputs (#396) * Toc (#394) * Update P landscapes Signed-off-by: ammedmar * Add distances, inner products and kernels glossary entry Signed-off-by: ammedmar * Remake vectorization changes Signed-off-by: ammedmar * Change [] for \lbrack \rbrack Signed-off-by: ammedmar * Update after W's comments Signed-off-by: ammedmar * Update after W's comments Signed-off-by: ammedmar * Update afte W's second comments Signed-off-by: ammedmar * Update after Umbe's comments Signed-off-by: ammedmar * Update after Umbe's second comments Signed-off-by: ammedmar * Update after Umbe's third comments Signed-off-by: ammedmar * Update after Umbe's 4th comments Signed-off-by: ammedmar * Remove concept k-skeleton Signed-off-by: ammedmar * Add table of content Signed-off-by: ammedmar * Add Lp & lp. 
Update landscape Signed-off-by: ammedmar * Update TOC indentation Signed-off-by: ammedmar * Add heat vectorizations entry Signed-off-by: ammedmar * Update indentation of TOC Signed-off-by: ammedmar * Fix extra spacing in bibliography Signed-off-by: ammedmar * Update bibliography hack for caps Signed-off-by: ammedmar * After Umbe's Comments Signed-off-by: ammedmar * Update after issue #398 Signed-off-by: ammedmar * Update after Umbe's comments Signed-off-by: ammedmar * Update after Umbe's 2nd comments Signed-off-by: ammedmar Co-authored-by: ammedmar * Speedup windows pipeline (#402) * Improve boost location for azure pipeline on windows The boost version installed in the pipeline is now used Signed-off-by: julian Co-authored-by: Umberto Lupo <46537483+ulupo@users.noreply.github.com> * Make bindings public (#395) * Make bindings public Signed-off-by: Guillaume Tauzin * Fix pipeline on Mac (#407) * Refresh ccache * Enforce CXX standard to 14 on each module * Change variable name to comply with E741 Signed-off-by: julian Co-authored-by: Umberto Lupo <46537483+ulupo@users.noreply.github.com> * Mapper visualisation refactor: fix bugs, add summary statistics in hovertext, improve opacity, remove matplotlib dependency, add node_scale kwarg, add clone_pipeline kwarg to interactive plots,restructure/rename plotly_kwargs, improve code (#406) * Refactor of `mapper/visualisation.py` and `mapper/utils/visualisation.py` - Removal of color scaling - More modularity - Variable and function name changes - Remove matplotlib functions used for hoverlabel background color - Remove cmin and cmax - Change return signature of some functions * Add test-output.xml to .gitignore * Replace "text" key in plot_options["node_trace"] with "hovertext" * Improve width and opacity of nodes and edges * Display summary statistics in hovertext with significant figure rounding via new n_sig_figs kwarg * Rename plotly_kwargs to plotly_params * Remove matplotlib installation requirement * Fix mapper 
notebook * Add small comment on use of matplotlib in voids_on_the_plane * Place matplotlib in examples requirement in setup.py * Improve docstring of make_mapper_pipeline * Improve examples for make_mapper_pipeline and create one for plot_static_mapper_graph * Hide set_node_sizeref, add node_scale kwarg, add clone_pipeline kwarg to plot_interactive_mapper_graph - Update docstrings - Update quickstart notebook * Add helper function for colorscale-based interpolations, improve use of plotly_params Static and interactive plots display the expect hoverlabel colors in 3D, or white if things go wrong. Thanks to @lewtun for pointing out that this was needed. * Improve docstrings * Hide visualization module in mapper/utils * Add pip install matplotlib to notebook tests in manylinux job (#410) * Fix mapper docstring issues following #406 (#411) * Fix issues with mapper docs following #406 * Create giotto-tda version 0.2.2 (#413) * Turn CODE_OWNERS and CODE_AUTHORS into an rst file * Bump version number to 0.2.2 * Add release notes for v0.2.2 Co-authored-by: Anibal M. 
Medina-Mardones Co-authored-by: ammedmar Co-authored-by: REDS institute Co-authored-by: Guillaume Tauzin --- .azure-ci/install_boost.py | 23 - .gitignore | 7 +- CMakeLists.txt | 20 +- CODE_AUTHORS | 14 - CODE_AUTHORS.rst | 15 + CODE_OWNERS | 7 - CODE_OWNERS.rst | 5 + README.rst | 1 - azure-pipelines.yml | 12 +- cmake/HelperBoost.cmake | 12 + doc/installation.rst | 1 - doc/library.rst | 4 +- doc/release.rst | 62 ++- doc/theory/bibliography.bib | 18 + doc/theory/glossary.tex | 335 +++++++++------ examples/mapper_quickstart.ipynb | 68 ++- examples/voids_on_the_plane.ipynb | 2 +- gtda/_version.py | 2 +- gtda/externals/__init__.py | 19 + gtda/mapper/pipeline.py | 59 +-- gtda/mapper/tests/test_visualization.py | 22 +- gtda/mapper/utils/_visualization.py | 396 +++++++++++++++++ gtda/mapper/utils/decorators.py | 26 +- gtda/mapper/utils/visualization.py | 327 -------------- gtda/mapper/visualization.py | 541 ++++++++++++++++-------- requirements.txt | 1 - setup.py | 5 +- 27 files changed, 1208 insertions(+), 796 deletions(-) delete mode 100644 .azure-ci/install_boost.py delete mode 100644 CODE_AUTHORS create mode 100644 CODE_AUTHORS.rst delete mode 100644 CODE_OWNERS create mode 100644 CODE_OWNERS.rst create mode 100644 gtda/mapper/utils/_visualization.py delete mode 100644 gtda/mapper/utils/visualization.py diff --git a/.azure-ci/install_boost.py b/.azure-ci/install_boost.py deleted file mode 100644 index 1f21a74c4..000000000 --- a/.azure-ci/install_boost.py +++ /dev/null @@ -1,23 +0,0 @@ -#!/usr/bin/env python - -import os -from pathlib import Path -import urllib.request -import shutil -import zipfile - - -url = "https://dl.bintray.com/boostorg/release/1.72.0/source/boost_1_72_0.zip" -boost_folder = r"C:\local" - -Path(boost_folder).mkdir(parents=True, exist_ok=True) -zip_file = os.path.join(boost_folder, "1_72_0.zip") - -with urllib.request.urlopen(url) as response, \ - open(zip_file, 'wb') as out_file: - shutil.copyfileobj(response, out_file) - -with 
zipfile.ZipFile(zip_file, 'r') as zip_ref: - zip_ref.extractall(boost_folder) - -os.remove(zip_file) diff --git a/.gitignore b/.gitignore index 24cf51bd3..ea2d9f47f 100644 --- a/.gitignore +++ b/.gitignore @@ -42,6 +42,9 @@ gtda/externals/pybind11 .pytest_cache/ .hypothesis/ +# Pytest output files +test-output.xml + # Latex *.aux *.bbl @@ -50,7 +53,5 @@ gtda/externals/pybind11 *.log *.pdf *.synctex.gz +*.toc doc/theory/glossary_backup - -# development -tda/mapper/scratch/ diff --git a/CMakeLists.txt b/CMakeLists.txt index 062b89a0b..eccc4b2b0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -18,11 +18,13 @@ set(HERA_DIR "gtda/externals/hera") ####################################################################### pybind11_add_module(gtda_ripser "${BINDINGS_DIR}/ripser_bindings.cpp") +set_property(TARGET gtda_ripser PROPERTY CXX_STANDARD 14) + if(OpenMP_FOUND) target_link_libraries(gtda_ripser PRIVATE OpenMP::OpenMP_CXX) endif() -target_compile_definitions(gtda_ripser PRIVATE ASSEMBLE_REDUCTION_MATRIX=1) +target_compile_definitions(gtda_ripser PRIVATE ASSEMBLE_REDUCTION_MATRIX=1) target_include_directories(gtda_ripser PRIVATE "${RIPSER_SRC_DIR}/ripser") if(MSVC) @@ -38,11 +40,13 @@ endif() ####################################################################### pybind11_add_module(gtda_ripser_coeff "${BINDINGS_DIR}/ripser_bindings.cpp") +set_property(TARGET gtda_ripser_coeff PROPERTY CXX_STANDARD 14) + if(OpenMP_FOUND) target_link_libraries(gtda_ripser_coeff PRIVATE OpenMP::OpenMP_CXX) endif() -target_compile_definitions(gtda_ripser_coeff PRIVATE USE_COEFFICIENTS=1 ASSEMBLE_REDUCTION_MATRIX=1) +target_compile_definitions(gtda_ripser_coeff PRIVATE USE_COEFFICIENTS=1 ASSEMBLE_REDUCTION_MATRIX=1) target_include_directories(gtda_ripser_coeff PRIVATE "${RIPSER_SRC_DIR}/ripser") if(MSVC) @@ -58,6 +62,8 @@ endif() ####################################################################### pybind11_add_module(gtda_wasserstein ${BINDINGS_DIR}/wasserstein_bindings.cpp) 
+set_property(TARGET gtda_wasserstein PROPERTY CXX_STANDARD 14) + target_link_libraries(gtda_wasserstein LINK_PUBLIC ${Boost_LIBRARIES}) target_compile_definitions(gtda_wasserstein PRIVATE BOOST_RESULT_OF_USE_DECLTYPE=1 BOOST_ALL_NO_LIB=1 BOOST_SYSTEM_NO_DEPRECATED=1) @@ -77,6 +83,8 @@ endif() ####################################################################### pybind11_add_module(gtda_bottleneck "${BINDINGS_DIR}/bottleneck_bindings.cpp") +set_property(TARGET gtda_bottleneck PROPERTY CXX_STANDARD 14) + target_link_libraries(gtda_bottleneck LINK_PUBLIC ${Boost_LIBRARIES}) target_compile_definitions(gtda_bottleneck PRIVATE BOOST_RESULT_OF_USE_DECLTYPE=1 BOOST_ALL_NO_LIB=1 BOOST_SYSTEM_NO_DEPRECATED=1) @@ -96,6 +104,7 @@ endif() ####################################################################### pybind11_add_module(gtda_cubical_complex "${BINDINGS_DIR}/cubical_complex_bindings.cpp") +set_property(TARGET gtda_cubical_complex PROPERTY CXX_STANDARD 14) if(OpenMP_FOUND) target_link_libraries(gtda_cubical_complex PRIVATE OpenMP::OpenMP_CXX) @@ -120,6 +129,7 @@ endif() ####################################################################### pybind11_add_module(gtda_persistent_cohomology "${BINDINGS_DIR}/persistent_cohomology_bindings.cpp") +set_property(TARGET gtda_persistent_cohomology PROPERTY CXX_STANDARD 14) if(OpenMP_FOUND) target_link_libraries(gtda_persistent_cohomology PRIVATE OpenMP::OpenMP_CXX) @@ -146,6 +156,7 @@ endif() ####################################################################### pybind11_add_module(gtda_simplex_tree "${BINDINGS_DIR}/simplex_tree_bindings.cpp") +set_property(TARGET gtda_simplex_tree PROPERTY CXX_STANDARD 14) if(OpenMP_FOUND) target_link_libraries(gtda_simplex_tree PRIVATE OpenMP::OpenMP_CXX) @@ -174,6 +185,7 @@ endif() ####################################################################### pybind11_add_module(gtda_periodic_cubical_complex "${BINDINGS_DIR}/periodic_cubical_complex_bindings.cpp") +set_property(TARGET 
gtda_periodic_cubical_complex PROPERTY CXX_STANDARD 14) if(OpenMP_FOUND) target_link_libraries(gtda_periodic_cubical_complex PRIVATE OpenMP::OpenMP_CXX) @@ -200,6 +212,7 @@ endif() ####################################################################### pybind11_add_module(gtda_witness_complex "${BINDINGS_DIR}/witness_complex_bindings.cpp") +set_property(TARGET gtda_witness_complex PROPERTY CXX_STANDARD 14) if(OpenMP_FOUND) target_link_libraries(gtda_witness_complex PRIVATE OpenMP::OpenMP_CXX) @@ -228,6 +241,7 @@ endif() ####################################################################### pybind11_add_module(gtda_strong_witness_complex "${BINDINGS_DIR}/strong_witness_complex_bindings.cpp") +set_property(TARGET gtda_strong_witness_complex PROPERTY CXX_STANDARD 14) if(OpenMP_FOUND) target_link_libraries(gtda_strong_witness_complex PRIVATE OpenMP::OpenMP_CXX) @@ -256,6 +270,7 @@ endif() ####################################################################### pybind11_add_module(gtda_sparse_rips_complex "${BINDINGS_DIR}/rips_complex_bindings.cpp") +set_property(TARGET gtda_sparse_rips_complex PROPERTY CXX_STANDARD 14) if(OpenMP_FOUND) target_link_libraries(gtda_sparse_rips_complex PRIVATE OpenMP::OpenMP_CXX) @@ -285,6 +300,7 @@ endif() ####################################################################### pybind11_add_module(gtda_cech_complex "${BINDINGS_DIR}/cech_complex_bindings.cpp") +set_property(TARGET gtda_cech_complex PROPERTY CXX_STANDARD 14) if(OpenMP_FOUND) target_link_libraries(gtda_cech_complex PRIVATE OpenMP::OpenMP_CXX) diff --git a/CODE_AUTHORS b/CODE_AUTHORS deleted file mode 100644 index 9a6bb4bec..000000000 --- a/CODE_AUTHORS +++ /dev/null @@ -1,14 +0,0 @@ -# The following is the list of the code authors of the giotto-tda python -# package. Where component authors are known, add them here. 
- -Guillaume Tauzin, guillaume.tauzin@epfl.ch -Umberto Lupo, u.lupo@l2f.ch -Lewis Tunstall, l.tunstall@l2f.ch -Matteo Caorsi, m.caorsi@l2f.ch -Philippe Nguyen, p.nguyen@l2f.ch -Julian Burella Pérez, julian.burellaperez@heig-vd.ch -Alessio Ghiraldello, amg28@protonmail.com -Adélie Garin, adelie.garin@epfl.ch -Anibal Medina-Mardones, anibal.medinamardones@epfl.ch -Wojciech Reise, reisewojciech@gmail.com -Roman Yurchak, roman.yurchak@symerio.com diff --git a/CODE_AUTHORS.rst b/CODE_AUTHORS.rst new file mode 100644 index 000000000..d8440eda8 --- /dev/null +++ b/CODE_AUTHORS.rst @@ -0,0 +1,15 @@ +The following is the list of code authors of the ``giotto-tda`` python package. + +Where component authors are known, add them here. + +| Guillaume Tauzin, guillaume.tauzin@epfl.ch +| Umberto Lupo, u.lupo@l2f.ch +| Lewis Tunstall, l.tunstall@l2f.ch +| Matteo Caorsi, m.caorsi@l2f.ch +| Philippe Nguyen, p.nguyen@l2f.ch +| Julian Burella Pérez, julian.burellaperez@heig-vd.ch +| Alessio Ghiraldello, amg28@protonmail.com +| Adélie Garin, adelie.garin@epfl.ch +| Anibal Medina-Mardones, anibal.medinamardones@epfl.ch +| Wojciech Reise, reisewojciech@gmail.com +| Roman Yurchak, roman.yurchak@symerio.com diff --git a/CODE_OWNERS b/CODE_OWNERS deleted file mode 100644 index 8cce5d770..000000000 --- a/CODE_OWNERS +++ /dev/null @@ -1,7 +0,0 @@ - -# The following is the list of the code owners of the giotto-tda python -# package. 
- -L2F SA -EPFL - Ecole Polytechnique Fédérale de Lausanne -REDS Institute of the Haut Ecole d'Ingénierie et Gestion du canton Vaud diff --git a/CODE_OWNERS.rst b/CODE_OWNERS.rst new file mode 100644 index 000000000..0d9b1d2fb --- /dev/null +++ b/CODE_OWNERS.rst @@ -0,0 +1,5 @@ +The following is the list of code owners of the ``giotto-tda`` Python package: + +- L2F SA +- EPFL - Ecole Polytechnique Fédérale de Lausanne +- REDS Institute of the Haut Ecole d'Ingénierie et Gestion du canton Vaud diff --git a/README.rst b/README.rst index cea8acc71..9a36600e8 100644 --- a/README.rst +++ b/README.rst @@ -68,7 +68,6 @@ The latest stable version of ``giotto-tda`` requires: - joblib (>= 0.13) - scikit-learn (>= 0.22.0) - python-igraph (>= 0.7.1.post6) -- matplotlib (>= 3.0.3) - plotly (>= 4.4.1) - ipywidgets (>= 7.5.1) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 881f05b33..e6afc9795 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -38,7 +38,7 @@ jobs: - task: Cache@2 inputs: - key: '"ccache-wheels-v2020.04.07" | $(Agent.OS) | "$(python.version)"' + key: '"ccache-wheels-v2020.05.12" | $(Agent.OS) | "$(python.version)"' path: $(CCACHE_DIR) displayName: ccache @@ -73,7 +73,7 @@ jobs: - script: | set -e - pip install pandas openml + pip install pandas openml matplotlib pip install "papermill==1.2.1" cd examples for n in *.ipynb @@ -133,7 +133,7 @@ jobs: - task: Cache@2 inputs: - key: '"ccache-v2020.04.07" | $(Agent.OS) | "$(python.version)"' + key: '"ccache-v2020.05.12" | $(Agent.OS) | "$(python.version)"' path: $(CCACHE_DIR) displayName: ccache @@ -246,9 +246,11 @@ jobs: condition: eq(variables['nightly_check'], 'true') displayName: 'Change name to giotto-tda-nightly' + # Set BOOST_ROOT_PIPELINE to the version used in the pipeline + # See https://github.com/actions/virtual-environments/issues/687#issuecomment-616345933 - script: | - python .azure-ci/install_boost.py || exit /b - displayName: 'Install boost' + echo "##vso[task.setvariable 
variable=BOOST_ROOT_PIPELINE]%BOOST_ROOT_1_72_0%" + displayName: 'Set env variable for boost version' - script: | python -m pip install --upgrade pip setuptools diff --git a/cmake/HelperBoost.cmake b/cmake/HelperBoost.cmake index 57fcee75f..d3f19b12e 100644 --- a/cmake/HelperBoost.cmake +++ b/cmake/HelperBoost.cmake @@ -7,6 +7,18 @@ if(WIN32) list(APPEND BOOST_ROOT "") # Add custom path to your boost installation endif() +# Changes introduced in latest version of the giotto-tda Azure pipelines produce +# compilation errors because CMake cannot find boost header files. +# After discussing in https://github.com/actions/virtual-environments/issues/687 +# this solution is used due to custom paths in the Azure setup. +message(STATUS "BOOST_ROOT_PIPELINE: $ENV{BOOST_ROOT_PIPELINE}") +if(DEFINED ENV{BOOST_ROOT_PIPELINE}) + file(TO_CMAKE_PATH $ENV{BOOST_ROOT_PIPELINE} CMAKE_BOOST_ROOT) + list(APPEND BOOST_ROOT "${CMAKE_BOOST_ROOT}") + list(APPEND BOOST_INCLUDEDIR "${CMAKE_BOOST_ROOT}/boost/include") + list(APPEND BOOST_LIBRARYDIR "${CMAKE_BOOST_ROOT}/lib") +endif() + message(STATUS "BOOST_ROOT: ${BOOST_ROOT}") find_package(Boost 1.56 REQUIRED) diff --git a/doc/installation.rst b/doc/installation.rst index c8137823c..dfcc3fb37 100644 --- a/doc/installation.rst +++ b/doc/installation.rst @@ -16,7 +16,6 @@ The latest stable version of giotto-tda requires: - joblib (>= 0.13) - scikit-learn (>= 0.22.0) - python-igraph (>= 0.7.1.post6) -- matplotlib (>= 3.0.3) - plotly (>= 4.4.1) - ipywidgets (>= 7.5.1) diff --git a/doc/library.rst b/doc/library.rst index 7c73509f4..19787b844 100644 --- a/doc/library.rst +++ b/doc/library.rst @@ -114,5 +114,5 @@ What's new .. include:: release.rst - :start-after: Release 0.2.1 - :end-before: Release 0.2.0 + :start-after: Release 0.2.2 + :end-before: Release 0.2.1 diff --git a/doc/release.rst b/doc/release.rst index 43ebed123..cbe237c3e 100644 --- a/doc/release.rst +++ b/doc/release.rst @@ -5,6 +5,65 @@ Release Notes .. 
_stable: +************* +Release 0.2.2 +************* + +Major Features and Improvements +=============================== + +- The documentation for ``gtda.mapper.utils.decorators.method_to_transform`` has been improved. +- A table of contents has been added to the theory glossary. +- The theory glossary has been restructured by including a section titled "Analysis". Entries for l^p norms, L^p norms and heat vectorization have been added. +- The project's Azure CI for Windows versions has been sped-up by ensuring that the locally installed boost version is detected. +- Several python bindings to external code from GUDHI, ripser.py and Hera have been made public: specifically, ``from gtda.externals import *`` now gives power users access to: + + - ``bottleneck_distance``, + - ``wasserstein_distance``, + - ``ripser``, + - ``SparseRipsComplex``, + - ``CechComplex``, + - ``CubicalComplex``, + - ``PeriodicCubicalComplex``, + - ``SimplexTree``, + - ``WitnessComplex``, + - ``StrongWitnessComplex``. + + However, these functionalities are still undocumented. +- The ``gtda.mapper.visualisation`` and ``gtda.mapper.utils._visualisation`` modules have been thoroughly refactored to improve code clarity, add functionality, change behaviour and fix bugs. Specifically, in figures generated by both ``plot_static_mapper_graph`` and ``plot_interactive_mapper_graph``: + + - The colorbar no longer shows values rescaled to the interval [0, 1]. Instead, it always shows the true range of node summary statistics. + - The values of the node summary statistics are now displayed in the hovertext boxes. A new keyword argument ``n_sig_figs`` controls their rounding (3 is the default). + - ``plotly_kwargs`` has been renamed to ``plotly_params`` (see "Backwards-Incompatible Changes" below). + - The dependency on ``matplotlib``'s ``rgb2hex`` and ``get_cmap`` functions has been removed.
As no other component in ``giotto-tda`` required ``matplotlib``, the dependency on this library has been removed completely. - A ``node_scale`` keyword argument has been added which can be used to control the size of nodes (see "Backwards-Incompatible Changes" below). - The overall look of Mapper graphs has been improved by increasing the opacity of node colors so that edges do not hide them, and by reducing the thickness of marker lines. Furthermore, a ``clone_pipeline`` keyword argument has been added to ``plot_interactive_mapper_graph``, which when set to ``False`` allows the user to mutate the input pipeline via the interactive widget. - The docstrings of ``plot_static_mapper_graph``, ``plot_interactive_mapper_graph`` and ``make_mapper_pipeline`` have been improved. Bug Fixes ========= - A CI bug introduced by an update to the XCode compiler installed on the Azure Mac machines has been fixed. - A bug afflicting Mapper colors, which was due to an incorrect rescaling to [0, 1], has been fixed. Backwards-Incompatible Changes ============================== - The keyword parameter ``plotly_kwargs`` in ``plot_static_mapper_graph`` and ``plot_interactive_mapper_graph`` has been renamed to ``plotly_params`` and now has slightly different specifications. A new logic controls how the information contained in ``plotly_params`` is used to update plotly figures. - The function ``get_node_sizeref`` in ``gtda.mapper.utils.visualization`` has been hidden by renaming it to ``_get_node_sizeref``. Its main intended use is subsumed by the new ``node_scale`` parameter of ``plot_static_mapper_graph`` and ``plot_interactive_mapper_graph``. Thanks to our Contributors ========================== This release contains contributions from many people: Umberto Lupo, Julian Burella Pérez, Anibal Medina-Mardones, Wojciech Reise and Guillaume Tauzin.
+ +We are also grateful to all who filed issues or helped resolve them, asked and answered questions, and were part of inspiring discussions. + ************* Release 0.2.1 ************* @@ -41,8 +100,7 @@ This release contains contributions from many people: Umberto Lupo, Anibal Medina-Mardones, Julian Burella Pérez, Guillaume Tauzin, and Wojciech Reise. -We are also grateful to all who filed issues or helped resolve them, asked and answered questions, and were part of -inspiring discussions. +We are also grateful to all who filed issues or helped resolve them, asked and answered questions, and were part of inspiring discussions. ************* Release 0.2.0 diff --git a/doc/theory/bibliography.bib b/doc/theory/bibliography.bib index 13a50369c..571cbb9e1 100644 --- a/doc/theory/bibliography.bib +++ b/doc/theory/bibliography.bib @@ -7,6 +7,24 @@ @incollection{takens1981detecting publisher={Springer} } +@article{adams2017persistence, + title={Persistence images: A stable vector representation of persistent homology}, + author={Adams, Henry and Emerson, Tegan and Kirby, Michael and Neville, Rachel and Peterson, Chris and Shipman, Patrick and Chepushtanova, Sofya and Hanson, Eric and Motta, Francis and Ziegelmeier, Lori}, + journal={The Journal of Machine Learning Research}, + volume={18}, + number={1}, + pages={218--252}, + year={2017}, + publisher={JMLR. 
org} +} + +@inproceedings{reininghaus2015stable, + title={A stable multi-scale kernel for topological machine learning}, + author={Reininghaus, Jan and Huber, Stefan and Bauer, Ulrich and Kwitt, Roland}, + booktitle={Proceedings of the {IEEE} conference on computer vision and pattern recognition}, + pages={4741--4748}, + year={2015} +} @inproceedings{chazal2014stochastic, address = {Kyoto, Japan}, diff --git a/doc/theory/glossary.tex b/doc/theory/glossary.tex index 0a9c8f518..8b69e415f 100644 --- a/doc/theory/glossary.tex +++ b/doc/theory/glossary.tex @@ -10,16 +10,24 @@ colorlinks=true, pagebackref=true, pdfpagelabels]{hyperref} \hypersetup{ + linktocpage, colorlinks, citecolor=blue, linkcolor=blue, urlcolor=blue} +\setcounter{tocdepth}{2} +\makeatletter +\def\l@subsection{\@tocline{2}{0pt}{3.5pc}{5pc}{}} +\makeatother + \begin{document} \title{Theory Glossary} \maketitle + \tableofcontents + \section{Symbols} \begin{tabular}{ l l} @@ -33,6 +41,154 @@ multiset $ \lbrace (s, s) \mid s \in \mathbb{R} \rbrace $ with multiplicity $ ( s,s ) \mapsto +\infty$. \end{tabular} + \section{Analysis} + + \subsection*{Metric space} \label{metric_space} + A set $X$ with a function + \begin{equation*} + d : X \times X \to \mathbb R + \end{equation*} + is said to be a \textit{metric space} if the values of $d$ are all non-negative and for all $x,y,z \in X$ + \begin{equation*} + d(x,y) = 0\ \Leftrightarrow\ x = y + \end{equation*} + \begin{equation*} + d(x,y) = d(y,x) + \end{equation*} + \begin{equation*} + d(x,z) \leq d(x,y) + d(y, z). + \end{equation*} + In this case the $d$ is referred to as the \textit{metric} or the \textit{distance function}. 
+ + \subsection*{Normed space} \label{normed_space} + A vector space $V$ together with a function + \begin{equation*} + ||-|| : V \to \mathbb R + \end{equation*} + is said to be a \textit{normed space} if the values of $||-||$ are all non-negative and for all $u,v \in V$ and $a \in \mathbb R$ + \begin{equation*} + ||u|| = 0\ \Leftrightarrow\ u = 0 + \end{equation*} + \begin{equation*} + ||a u || = |a|\, ||u|| + \end{equation*} + \begin{equation*} + ||u+v|| \leq ||u|| + ||v||. + \end{equation*} + The function $||-||$ is referred to as the \textit{norm}. + + A normed space is naturally a + %\hyperref[metric_space]{metric space} + metric space with distance function + \begin{equation*} + d(u,v) = ||u-v||. + \end{equation*} + + \subsection*{Inner product space} \label{inner_product_space} + + A vector space $V$ together with a function + \begin{equation*} + \langle -, - \rangle : V \times V \to \mathbb R + \end{equation*} + is said to be an \textit{inner product space} if for all $u,v,w \in V$ and $a \in \mathbb R$ + \begin{equation*} + u \neq 0\ \Rightarrow\ \langle u, u \rangle > 0 + \end{equation*} + \begin{equation*} + \langle u, v\rangle = \langle v, u\rangle + \end{equation*} + \begin{equation*} + \langle au+v, w \rangle = a\langle u, w \rangle + \langle v, w \rangle. + \end{equation*} + The function $\langle -, - \rangle$ is referred to as the \textit{inner product}. + + An inner product space is naturally a normed space with + \begin{equation*} + ||u|| = \sqrt{\langle u, u \rangle}. + \end{equation*} + + \subsection*{Vectorization, amplitude and kernel} \label{vectorization_amplitude_and_kernel} + + Let $X$ be a set, for example, the set of all + %\hyperref[persistence_diagram]{persistence diagrams} + persistence diagrams. A \textit{vectorization} for $X$ is a function + \begin{equation*} + \phi : X \to V + \end{equation*} + where $V$ is a vector space.
+ + An \textit{amplitude} on $X$ is a function + \begin{equation*} + A : X \to \mathbb R + \end{equation*} + for which there exists a vectorization $\phi : X \to V$ with $V$ a + %\hyperref[normed_space]{normed space} + normed space such that + \begin{equation*} + A(x) = ||\phi(x)|| + \end{equation*} + for all $x \in X$. + + A \textit{kernel} on the set $X$ is a function + \begin{equation*} + k : X \times X \to \mathbb R + \end{equation*} + for which there exists a vectorization $\phi : X \to V$ with $V$ an + %\hyperref[inner_product_space]{inner product space} + inner product space such that + \begin{equation*} + k(x,y) = \langle \phi(x), \phi(y) \rangle + \end{equation*} + for each $x,y \in X$. + + \subsection*{Euclidean distance and $l^p$-norms} \label{euclidean_distance_and_norm} + + The vector space $\mathbb R^n$ is an + % \hyperref[inner_product_space]{inner product space} + inner product space with inner product + \begin{equation*} + \langle x, y \rangle = x_1 y_1 + \cdots + x_n y_n. + \end{equation*} + This inner product is referred to as \textit{dot product} and the associated norm and distance function are respectively named \textit{euclidean norm} and \textit{euclidean distance}. + + For any $p \in (0,\infty]$ the pair $\mathbb R^n, ||-||_p$ with + \begin{equation*} + ||x||_p = (|x_1|^p + \cdots + |x_n|^p)^{1/p} + \end{equation*} + if $p$ is finite and + \begin{equation*} + ||x||_{\infty} = \max\{|x_i|\ |\ i = 1,\dots,n\} + \end{equation*} + is a normed space and its norm is referred to as the $l^p$\textit{-norm}. + + \subsection*{Distance matrices and point clouds} \label{distance_matrices_and_point_clouds} + + Let $(X, d)$ be a finite + % \hyperref[metric_space]{metric space} + metric space. A \textit{distance matrix} associated to it is obtained by choosing a total order on $X = {x_1 < \cdots < x_m}$ and setting the $(i,j)$-entry to be equal to $d(x_i, x_j)$.
+ + A \textit{point cloud} is a finite subset of $\mathbb{R}^n$ (for some $n$) together with the metric induced from the + % \hyperref[euclidean_distance_and_norm]{euclidean distance} + euclidean distance. + + \subsection*{$L^p$-norms} \label{functional_lp} + + Let $U \subseteq \mathbb R^n$ and $C(U, \mathbb R)$ be the set of continuous real-valued functions on $U$. A function $f \in C(U, \mathbb R)$ is said to be $p$\textit{-integrable} if + \begin{equation*} + \int_U |f(x)|^p dx + \end{equation*} + is finite. The subset of $p$-integrable functions together with the assignment $||-||_p$ + \begin{equation*} + f \mapsto \left( \int_U |f(x)|^p dx \right)^{1/p} + \end{equation*} + is a + % \hyperref[normed_space]{normed space} + normed space and $||-||_p$ is referred to as the $L^p$\textit{-norm}. + + The only $L^p$-norm that is induced from an inner product is $L^2$, and the inner product is given by + \begin{equation*} + \langle f, g \rangle = \int_U f(x) g(x)\, dx. + \end{equation*} \section{Homology} \subsection*{Cubical complex} \label{cubical_complex} @@ -51,10 +207,10 @@ \end{equation*} we define for $i = 1, \dots, n$ the following two elementary cubes \begin{equation*} - d_i^\pm I^N = I_{a_1} \times \cdots \times d^\pm I_{a_{k_i}} \times \cdots \times I_{a_{N.}} + d_i^\pm I^N = I_{a_1} \times \cdots \times d^\pm I_{a_{k_i}} \times \cdots \times I_{a_{N}}. \end{equation*} - A \textit{cubical complex} is a finite set of elementary cubes of $\mathbb{R}^N$, and a \textit{subcomplex} of $X$ is a cubical complex whose elementary cubes are also in $X$. We denote the set of $n$-dimensional cubes as $X_n$. + A \textit{cubical complex} is a finite set of elementary cubes of $\mathbb{R}^N$, and a \textit{subcomplex} of $X$ is a cubical complex whose elementary cubes are also in $X$.
\paragraph{\\ Reference:} \cite{mischaikow04computational} @@ -85,8 +241,6 @@ The elements of $X$ are called \textit{simplices} and the \textit{dimension} of a simplex $x$ is defined by $|x| = \# x - 1$ where $\# x$ denotes the cardinality of $x$. Simplices of dimension $d$ are called $d$-simplices. We abuse terminology and refer to the elements of $V$ and to their associated $0$-simplices both as \textit{vertices}. - The $k$\textit{-skeleton} $X_k$ of a simplicial complex $X$ is the subcomplex containing all simplices of dimension at most $k$. A simplicial complex is said to be $d$\textit{-dimensional} if $d$ is the smallest integer satisfying $X = X_d$. - A \textit{simplicial map} between simplicial complexes is a function between their vertices such that the image of any simplex via the induced map is a simplex. A simplicial complex $X$ is a \textit{subcomplex} of a simplicial complex $Y$ if every simplex of $X$ is a simplex of $Y$. @@ -145,7 +299,7 @@ \subsection*{Simplicial chains and simplicial homology} \label{simplicial_chains_and_simplicial_homology} - Let $X$ be an ordered or directed simplicial complex. Define its \textit{simplicial chain complex with} $\Bbbk$\textit{-coefficients} $C_*(X; \Bbbk)$ by + Let $X$ be an ordered or directed simplicial complex and denote the subset of $n$-simplices by $X_n$. Define its \textit{simplicial chain complex with} $\Bbbk$\textit{-coefficients} $C_*(X; \Bbbk)$ by \begin{equation*} C_n(X; \Bbbk) = \Bbbk\{X_n\}, \qquad \partial_n(x) = \sum_{i=0}^{n} (-1)^i d_ix \end{equation*} @@ -161,7 +315,7 @@ \subsection*{Cubical chains and cubical homology} \label{cubical_chains_and_cubical_homology} - Let $X$ be a cubical complex. Define its \textit{cubical chain complex with} $\Bbbk$\textit{-coefficients} $C_*(X; \Bbbk)$ by + Let $X$ be a cubical complex and denote the subset of $n$-cubes by $X_n$. 
Define the \textit{cubical chain complex with} $\Bbbk$\textit{-coefficients} $C_*(X; \Bbbk)$ by \begin{equation*} C_n(X; \Bbbk) = \Bbbk\{X_n\}, \qquad \partial_n x = \sum_{i = 1}^{n} (-1)^{i-1}(d^+_i x - d^-_i x) \end{equation*} @@ -170,6 +324,8 @@ % \hyperref[homology_and_cohomology]{homology and cohomology} homology and cohomology of this chain complex. We use the notation $H_*(X; \Bbbk)$ and $H^*(X; \Bbbk)$ for these. + \section{Persistence} + \subsection*{Filtered complex} \label{filtered_complex} A \textit{filtered complex} is a collection of simplicial or cubical complexes $\{X_s\}_{s \in \mathbb R}$ such that $X_s$ is a subcomplex of $X_t$ for each $s \leq t$. @@ -225,14 +381,14 @@ \begin{equation*} H_*(X(s); \Bbbk) \end{equation*} - with structure maps $H_*(f_{st}) : H_*(X(s); \Bbbk) \to H_*(X(t); \Bbbk)$ induced form the maps $f_{st.}$ In general, the collection constructed this way needs not satisfy the tameness condition of a + with structure maps $H_*(f_{st}) : H_*(X(s); \Bbbk) \to H_*(X(t); \Bbbk)$ induced form the maps $f_{st}.$ In general, the collection constructed this way needs not satisfy the tameness condition of a % \hyperref[persistence_module]{persistence module} persistence module, but we restrict attention to the cases where it does. Its \textit{persistence simplicial cohomology with} $\Bbbk$\textit{-coefficients} is defined analogously. \subsection*{Vietoris-Rips complex and Vietoris-Rips persistence} \label{vietoris-rips_complex_and_vietoris-rips_persistence} Let $(X, d)$ be a - % \hyperref[finite_metric_spaces_and_point_clouds]{finite metric space} + % \hyperref[distance_matrices_and_point_clouds]{finite metric space} finite metric space. 
Define the Vietoris-Rips complex of $X$ as the % \hyperref[filtered_complex]{filtered complex} filtered complex $VR_s(X)$ that contains a subset of $X$ as a simplex if all pairwise distances in the subset are less than or equal to $s$, explicitly @@ -251,11 +407,10 @@ % \hyperref[clique_and_flag_complexes]{filtered clique complex} filtered clique complex associated to $(X \times X ,w)$. - \subsection*{\v{C}ech complex and \v{C}ech persistence} \label{cech_complex_and_cech_persistence} Let $(X, d)$ be a - % \hyperref[finite_metric_spaces_and_point_clouds]{point cloud} + % \hyperref[distance_matrices_and_point_clouds]{point cloud} point cloud. Define the \v{C}ech complex of $X$ as the % \hyperref[filtered_complex]{filtered complex} filtered complex $\check{C}_s(X)$ that is empty if $s<0$ and, if $s \geq 0$, contains a subset of $X$ as a simplex if the balls of radius $s$ with centers in the subset have a non-empty intersection, explicitly @@ -288,7 +443,7 @@ \end{equation*} Given a % \hyperref[persistence_module]{persistence module} - persistence module, its associated persistence diagram is determined by the following condition: for each pair $s,t$ the number counted with multiplicity of points $(b,d)$ in the multiset, satisfying $b \leq s \leq t < d$ is equal to the rank of $f_{st.}$ + persistence module, its associated persistence diagram is determined by the following condition: for each pair $s,t$ the number counted with multiplicity of points $(b,d)$ in the multiset, satisfying $b \leq s \leq t < d$ is equal to the rank of $f_{st}.$ A well known result establishes that there exists an isomorphism between two persistence module if and only if their persistence diagrams are equal. @@ -302,7 +457,7 @@ The limit $p \to \infty$ defines the \textit{bottleneck distance}. 
More explicitly, it is the infimum over the same set of bijections of the value \begin{equation*} - \sup_{x \in D_1 \cup \Delta} ||x - \gamma(x)||_{\infty.} + \sup_{x \in D_1 \cup \Delta} ||x - \gamma(x)||_{\infty}. \end{equation*} The set of persistence diagrams together with any of the distances above is a @@ -323,12 +478,12 @@ \begin{equation*} \Lambda_i(t) = \left[ \min \{t-b_i, d_i-t\}\right]_+ \end{equation*} - and $c_+ := \max(c,0)$. The function $\lambda_k$ is referred to as the \textit{$k$-layer of the persistence landscape}. + and $c_+ := \max(c,0)$. The function $\lambda_k$ is referred to as the $k$-\textit{layer of the persistence landscape}. We describe the graph of each $\lambda_k$ intuitively. For each $i \in I$, draw an isosceles triangle with base the interval $(b_i, d_i)$ on the horizontal $t$-axis, and sides with slope 1 and $-1$. This subdivides the plane into a number of polygonal regions. Label each of these regions by the number of triangles containing it. If $P_k$ is the union of the polygonal regions with values at least $k$, then the graph of $\lambda_k$ is the upper contour of $P_k$, with $\lambda_k(a) = 0$ if the vertical line $t=a$ does not intersect $P_k$. The persistence landscape construction defines a - %\hyperref[vectorization_kernel_and_amplitude]{vectorization} + %\hyperref[vectorization_amplitude_and_kernel]{vectorization} vectorization of the set of persistence diagrams with target the vector space of real-valued function on $\mathbb N \times \mathbb R$. For any $p = 1,\dots,\infty$ we can restrict attention to persistence diagrams $D$ whose associated persistence landscape $\lambda$ is %\hyperref[lp_norm]{$p$-integrable} $p$-integrable, that is to say, @@ -339,9 +494,11 @@ \begin{equation*} ||\lambda_i||_p = \left( \int_{\mathbb R} \lambda_i^p(x)\, dx \right)^{1/p} \end{equation*} - is finite. 
In this case we refer to \eqref{equation:persistence_landscape_norm} as the - %\hyperref[vectorization_kernel_and_amplitude]{amplitude} - \textit{landscape} $p$-\textit{amplitude} of $D$. + is finite. In this case we refer to \eqref{equation:persistence_landscape_norm} as the $p$-\textit{landscape norm} of $D$ and, for $p = 2$, define the value of the \textit{landscape kernel} on two persistence diagrams $D$ and $E$ as + \begin{equation*} + \langle \lambda, \mu \rangle = \left(\sum_{i \in \mathbb N} \int_{\mathbb R} |\lambda_i(x) - \mu_i(x)|^2\, dx\right)^{1/2} + \end{equation*} + where $\lambda$ and $\mu$ are their associated persistence landscapes. \paragraph{\\ References:} \cite{bubenik2015statistical} @@ -349,7 +506,7 @@ Let $D = \{(b_i, d_i)\}_{i \in I}$ be a %\hyperref[persistence_diagram]{persistence diagram} - persistence diagram and $w = \{w_i\}_{i \in I}$ a set of positive real numbers. The \textit{silhouette of $D$ weighted by $w$} is the function $\phi : \mathbb R \to \mathbb R$ defined by + persistence diagram and $w = \{w_i\}_{i \in I}$ a set of positive real numbers. The \textit{silhouette of} $D$ \textit{weighted by} $w$ is the function $\phi : \mathbb R \to \mathbb R$ defined by \begin{equation*} \phi(t) = \frac{\sum_{i \in I}w_i \Lambda_i(t)}{\sum_{i \in I}w_i}, \end{equation*} @@ -357,12 +514,34 @@ \begin{equation*} \Lambda_i(t) = \left[ \min \{t-b_i, d_i-t\}\right]_+ \end{equation*} - and $c_+ := \max(c,0)$. When $w_i = \vert d_i - b_i \vert^p$ for $0 < p \leq \infty$ we refer to $\phi$ as the \textit{$p$-power-weighted silhouette} of $D$. The silhouette construction defines a - %\hyperref[vectorization_kernel_and_amplitude]{vectorization} - vectorization of the set of persistence diagrams with target the vector space of continuous real-valued functions on $\mathbb R$. + and $c_+ := \max(c,0)$. When $w_i = \vert d_i - b_i \vert^p$ for $0 < p \leq \infty$ we refer to $\phi$ as the $p$-\textit{power-weighted silhouette} of $D$. 
The silhouette construction defines a + %\hyperref[vectorization_amplitude_and_kernel]{vectorization} + vectorization of the set of persistence diagrams with target the vector space of continuous real-valued functions on $\mathbb R$. \paragraph{\\ References:} \cite{chazal2014stochastic} + \subsection*{Heat vectorizations} \label{heat_vectorization} + + Considering the points in a persistence diagram as the support of Dirac deltas one can construct, for any $t > 0$, + %\hyperref[vectorization_amplitude_and_kernel]{vectorization} + two vectorizations of the set of persistence diagrams to the set of continuous real-valued function on the first quadrant $\mathbb{R}^2_{>0}$. The \textit{symmetry heat vectorization} is constructed for every persistence diagram $D$ by solving the heat equation + \begin{align} \label{equation: heat equation} + \Delta_x(u) &= \partial_t u && \text{on } \Omega \times \mathbb R_{>0} \nonumber \\ + u &= 0 && \text{on } \{x_1 = x_2\} \times \mathbb R_{\geq 0} \\ + u &= \sum_{p \in D} \delta_p && \text{on } \Omega \times {0} \nonumber + \end{align} + where $\Omega = \{(x_1, x_2) \in \mathbb R^2\ |\ x_1 \leq x_2\}$, then solving the same equation after precomposing the data of \eqref{equation: heat equation} with the change of coordinates $(x_1, x_2) \mapsto (x_2, x_1)$, and defining the image of $D$ to be the difference between these two solutions at the chosen time $t$. + + Similarly, the \textit{rotation heat vectorization} is defined by sending $D$ to the solution, evaluated at time $t$, of the equation obtained by precomposing the data of \eqref{equation: heat equation} with the change of coordinates $(x_1, x_2) \mapsto (x_1, x_2-x_1)$. 
+ + We recall that the solution to the heat equation with initial condition given by a Dirac delta supported at $p \in \mathbb R^2$ is + \begin{equation*} + \frac{1}{4 \pi t} \exp\left(-\frac{||p-x||^2}{4t}\right) + \end{equation*} + and, to highlight the connection with normally distributed random variables, it is customary to use the the change of variable $\sigma = \sqrt{2t}$. + + \paragraph{\\ References:} \cite{reininghaus2015stable,adams2017persistence} + \subsection*{Persistence entropy} \label{persistence_entropy} Intuitively, this is a measure of the entropy of the points in a @@ -386,118 +565,6 @@ The name is inspired from the case when the persistence diagram comes from persistent homology. - \subsection*{Metric space} \label{metric_space} - A set $X$ with a function - \begin{equation*} - d : X \times X \to \mathbb R - \end{equation*} - is said to be a \textit{metric space} if the values of $d$ are all non-negative and for all $x,y,z \in X$ - \begin{equation*} - d(x,y) = 0\ \Leftrightarrow\ x = y - \end{equation*} - \begin{equation*} - d(x,y) = d(y,x) - \end{equation*} - \begin{equation*} - d(x,z) \leq d(x,y) + d(y, z). - \end{equation*} - In this case the $d$ is referred to as the \textit{metric} or the \textit{distance function}. - - \subsection*{Inner product and norm} \label{inner_product_and_norm} - - A vector space $V$ together with a function - \begin{equation*} - \langle -, - \rangle : V \times V \to \mathbb R - \end{equation*} - is said to be an \textit{inner product space} if for all $u,v,w \in V$ and $a \in \mathbb R$ - \begin{equation*} - u \neq 0\ \Rightarrow\ \langle u, u \rangle > 0 - \end{equation*} - \begin{equation*} - \langle u, v\rangle = \langle v, u\rangle - \end{equation*} - \begin{equation*} - \langle au+v, w \rangle = a\langle u, w \rangle + \langle v, w \rangle. - \end{equation*} - The function $\langle -, - \rangle$ is referred to as the \textit{inner product}. 
- - A vector space $V$ together with a function - \begin{equation*} - ||-|| : V \to \mathbb R - \end{equation*} - is said to be an \textit{normed space} if the values of $||-||$ are all non-negative and for all $u,v \in V$ and $a \in \mathbb R$ - \begin{equation*} - ||v|| = 0\ \Leftrightarrow\ u = 0 - \end{equation*} - \begin{equation*} - ||a u || = |a|\, ||u|| - \end{equation*} - \begin{equation*} - ||u+v|| = ||u|| + ||v||. - \end{equation*} - The function $||-||$ is referred to as the \textit{norm}. - - An inner product space is naturally a norm space with - \begin{equation*} - ||u|| = \sqrt{\langle u, u \rangle} - \end{equation*} - and a norm space is naturally a - %\hyperref[metric_space]{metric space} - metric space with distance function - \begin{equation*} - d(u,v) = ||u-v||. - \end{equation*} - - \subsection*{Euclidean distance and norm} \label{euclidean_distance_and_norm} - - The vector space $\mathbb R^n$ is an - % \hyperref[metric_inner_product_and_kernel]{inner product space} - inner product space with inner product - \begin{equation*} - \langle x, y \rangle = (x_1-y_1)^2 + \cdots + (x_n-y_n)^2. - \end{equation*} - This inner product is referred to as \textit{dot product} and the associated norm and distance function are respectively named \textit{euclidean norm} and \textit{euclidean distance}. - - \subsection*{Vectorization, kernel and amplitude} \label{vectorization_kernel_and_amplitude} - - Let $X$ be a set, for example, the set of all - %\hyperref[persistence_diagram]{persistence diagrams} - persistence diagrams. A \textit{vectorization} for $X$ is a function - \begin{equation*} - \phi : X \to V - \end{equation*} - where $V$ is a vector space. 
A \textit{kernel} on the set $X$ is a function - \begin{equation*} - k : X \times X \to \mathbb R - \end{equation*} - for which there exists a vectorization $\phi : X \to V$ with $V$ an - %\hyperref[inner_product_and_norm]{inner product space} - inner product space such that - \begin{equation*} - k(x,y) = \langle \phi(x), \phi(y) \rangle - \end{equation*} - for each $x,y \in X$. Similarly, an \textit{amplitude} on $X$ is a function - \begin{equation*} - A : X \to \mathbb R - \end{equation*} - for which there exists a vectorization $\phi : X \to V$ with $V$ a - %\hyperref[inner_product_and_norm]{normed space} - normed space such that - \begin{equation*} - A(x) = ||\phi(x)|| - \end{equation*} - for all $x \in X$. - - \subsection*{Finite metric spaces and point clouds} \label{finite_metric_spaces_and_point_clouds} - - A \textit{finite metric space} is a finite set together with a - % \hyperref[metric_inner_product_and_kernel]{metric} - metric. A \textit{distance matrix} associated to a finite metric space is obtained by choosing a total order on the finite set and setting the $(i,j)$-entry to be equal to the distance between the $i$-th and $j$-th elements. - - A \textit{point cloud} is a finite subset of $\mathbb{R}^n$ (for some $n$) together with the metric induced from the - % \hyperref[euclidean_distance_and_norm]{eucliden distance} - euclidean distance. - \section{Time series} \subsection*{Time series} \label{time_series} @@ -512,7 +579,7 @@ \begin{equation*} f : U \subseteq \mathbb R \to \mathbb R \end{equation*} - we obtain a new time series $\{f(x_i)\}_{i = 0.}^n$ + we obtain a new time series $\{f(x_i)\}_{i = 0}^n$. 
Generalizing the previous construction we can define a time series from a function \begin{equation*} diff --git a/examples/mapper_quickstart.ipynb b/examples/mapper_quickstart.ipynb index 6ec97d846..67c561b59 100644 --- a/examples/mapper_quickstart.ipynb +++ b/examples/mapper_quickstart.ipynb @@ -44,9 +44,8 @@ " make_mapper_pipeline,\n", " Projection,\n", " plot_static_mapper_graph,\n", - " plot_interactive_mapper_graph,\n", + " plot_interactive_mapper_graph\n", ")\n", - "from gtda.mapper.utils.visualization import set_node_sizeref\n", "\n", "# ML tools\n", "from sklearn import datasets\n", @@ -177,9 +176,9 @@ "metadata": {}, "outputs": [], "source": [ - "plotly_kwargs = {\"node_trace_marker_colorscale\": \"Blues\"}\n", + "plotly_params = {\"node_trace\": {\"marker_colorscale\": \"Blues\"}}\n", "fig = plot_static_mapper_graph(\n", - " pipe, data, color_by_columns_dropdown=True, plotly_kwargs=plotly_kwargs\n", + " pipe, data, color_by_columns_dropdown=True, plotly_params=plotly_params\n", ")\n", "fig.show(config={'scrollZoom': True})" ] @@ -287,6 +286,7 @@ "metadata": {}, "source": [ "### Change the layout dimension\n", + "\n", "It is also possible to visualise the Mapper graph in 3-dimensions by configuring the `layout_dim` argument:" ] }, @@ -300,6 +300,26 @@ "fig.show(config={'scrollZoom': True})" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Change the node size scale\n", + "\n", + "In general, node sizes are proportional to the number of dataset elements contained in the nodes. Sometimes, however, the default scale leads to graphs which are difficult to decipher, due to e.g. excessively small nodes. The `node_scale` parameter can be used to configure this scale. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "node_scale = 30\n", + "fig = plot_static_mapper_graph(pipe, data, layout_dim=3, node_scale=node_scale)\n", + "fig.show(config={'scrollZoom': True})" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -322,7 +342,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The resulting graph is an [python-igraph](https://igraph.org/python/) object that contains metadata that is stored in the form of dictionaries. We can access this data as follows:" + "The resulting graph is a [`python-igraph`](https://igraph.org/python/) object that contains metadata that is stored in the form of dictionaries. We can access this data as follows:" ] }, { @@ -360,46 +380,12 @@ "outputs": [], "source": [ "print(\n", - " \"Node Id: {}, \\nNode elements: {}, \\nData points: {}\".format(\n", + " \"Node ID: {}, \\nNode elements: {}, \\nData points: {}\".format(\n", " node_id[0], node_elements[0], data[node_elements[0]]\n", " )\n", ")" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The `node_elements` are handy for situations when we want to customise e.g. the size of the node scale. In this example, we use the utility function `set_node_sizeref()` and pass the function as a plotly argument:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Configure scale for node sizes\n", - "plotly_kwargs = {\n", - " \"node_trace_marker_sizeref\": set_node_sizeref(node_elements, node_scale=30)\n", - "}\n", - "fig = plot_static_mapper_graph(\n", - " pipe,\n", - " data,\n", - " layout_dim=3,\n", - " color_by_columns_dropdown=True,\n", - " plotly_kwargs=plotly_kwargs,\n", - ")\n", - "fig.show(config={'scrollZoom': True})" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The resulting graph is much easier to decipher with the enlarged node scaling!" 
- ] - }, { "cell_type": "markdown", "metadata": {}, @@ -432,7 +418,7 @@ "metadata": {}, "outputs": [], "source": [ - "fig = plot_static_mapper_graph(pipe, data, plotly_kwargs=None)\n", + "fig = plot_static_mapper_graph(pipe, data)\n", "fig.show(config={'scrollZoom': True})" ] }, diff --git a/examples/voids_on_the_plane.ipynb b/examples/voids_on_the_plane.ipynb index 19e7c24e1..f67fbed3b 100644 --- a/examples/voids_on_the_plane.ipynb +++ b/examples/voids_on_the_plane.ipynb @@ -32,7 +32,7 @@ "from gtda.homology import VietorisRipsPersistence\n", "import itertools\n", "\n", - "import matplotlib.pyplot as plt\n", + "import matplotlib.pyplot as plt # Not a requirement of giotto-tda, but is needed here\n", "\n", "np.random.seed(1) # Set numpy's random seed" ] diff --git a/gtda/_version.py b/gtda/_version.py index 7ffe6f6dd..589f7abfa 100644 --- a/gtda/_version.py +++ b/gtda/_version.py @@ -19,4 +19,4 @@ # 'X.Y.dev0' is the canonical version of 'X.Y.dev' # -__version__ = '0.2.1' +__version__ = '0.2.2' diff --git a/gtda/externals/__init__.py b/gtda/externals/__init__.py index 8b1378917..00bffafff 100644 --- a/gtda/externals/__init__.py +++ b/gtda/externals/__init__.py @@ -1 +1,20 @@ +""""Python bindings for external dependencies.""" +# License: GNU AGPLv3 +from .modules.gtda_bottleneck import bottleneck_distance +from .modules.gtda_wasserstein import wasserstein_distance +from .python import ripser, SparseRipsComplex, CechComplex, CubicalComplex, \ + PeriodicCubicalComplex, SimplexTree, WitnessComplex, StrongWitnessComplex + +__all__ = [ + 'bottleneck_distance', + 'wasserstein_distance', + 'ripser', + 'SparseRipsComplex', + 'CechComplex', + 'CubicalComplex', + 'PeriodicCubicalComplex', + 'SimplexTree', + 'WitnessComplex', + 'StrongWitnessComplex' + ] diff --git a/gtda/mapper/pipeline.py b/gtda/mapper/pipeline.py index aad57acf7..5f65d5b73 100644 --- a/gtda/mapper/pipeline.py +++ b/gtda/mapper/pipeline.py @@ -216,14 +216,13 @@ def make_mapper_pipeline(scaler=None, memory : 
None, str or object with the joblib.Memory interface, \ optional, default: ``None`` - Used to cache the fitted transformers of the pipeline. By default, no - caching is performed. If a string is given, it is the path to the - caching directory. Enabling caching triggers a clone of the - transformers before fitting. Therefore, the transformer instance - given to the pipeline cannot be inspected directly. Use the attribute - ``named_steps`` or ``steps`` to inspect estimators within the - pipeline. Caching the transformers is advantageous when fitting is - time consuming. + Used to cache the fitted transformers which make up the pipeline. This + is advantageous when the fitting of early steps is time consuming and + only later steps in the pipeline are modified (e.g. using + :meth:`set_params`) before refitting on the same data. To be used + exactly as for :func:`sklearn.pipeline.make_pipeline`. By default, no + no caching is performed. If a string is given, it is the path to the + caching directory. See [3]_. verbose : bool, optional, default: ``False`` If True, the time elapsed while fitting each step will be printed as it @@ -236,7 +235,8 @@ def make_mapper_pipeline(scaler=None, Examples -------- - >>> # Example of basic usage with default parameters + Basic usage with default parameters + >>> import numpy as np >>> from gtda.mapper import make_mapper_pipeline >>> mapper = make_mapper_pipeline() @@ -258,16 +258,17 @@ def make_mapper_pipeline(scaler=None, dict_keys(['node_id', 'pullback_set_label', 'partial_cluster_label', 'node_elements']) >>> # Find which points belong to first node of graph - >>> node_id, node_elements = mapper_graph['node_metadata']['node_id'], - ... 
mapper_graph['node_metadata']['node_elements'] - >>> print(f'Node Id: {node_id[0]}, Node elements: {node_elements[0]}, ' - f'Data points: {X[node_elements[0]]}') + >>> node_id = mapper_graph['node_metadata']['node_id'] + >>> node_elements = mapper_graph['node_metadata']['node_elements'] + >>> print(f"Node ID: {node_id[0]}, Node elements: {node_elements[0]}, " + ... f"Data points: {X[node_elements[0]]}") Node Id: 0, Node elements: [8768], Data points: [[0.01838998 0.76928754 0.98199244 0.0074299 ]] - >>> ####################################################################### - >>> # Example using a scaler from scikit-learn, a filter function from - >>> # gtda.mapper.filter, and a clusterer from gtda.mapper.cluster + + Using a scaler from scikit-learn, a filter function from + gtda.mapper.filter, and a clusterer from gtda.mapper.cluster + >>> from sklearn.preprocessing import MinMaxScaler >>> from gtda.mapper import Projection, FirstHistogramGap >>> scaler = MinMaxScaler() @@ -276,15 +277,17 @@ def make_mapper_pipeline(scaler=None, >>> mapper = make_mapper_pipeline(scaler=scaler, ... filter_func=filter_func, ... clusterer=clusterer) - >>> ####################################################################### - >>> # Example using a callable acting on each row of X separately + + Using a callable acting on each row of X separately + >>> import numpy as np >>> from gtda.mapper import OneDimensionalCover >>> cover = OneDimensionalCover() >>> mapper.set_params(scaler=None, filter_func=np.sum, cover=cover) - >>> ####################################################################### - >>> # Example setting the memory parameter to cache each step and avoid - >>> # recomputation of early steps + + Setting the memory parameter to cache each step and avoid recomputation + of early steps + >>> from tempfile import mkdtemp >>> from shutil import rmtree >>> cachedir = mkdtemp() @@ -301,9 +304,10 @@ def make_mapper_pipeline(scaler=None, [Pipeline] ............. 
(step 3 of 3) Processing nerve, total= 0.0s >>> # Clear the cache directory when you don't need it anymore >>> rmtree(cachedir) - >>> ####################################################################### - >>> # Example using a large dataset for which parallelism in - >>> # clustering across the pullback cover sets can be beneficial + + Using a large dataset for which parallelism in clustering across + the pullback cover sets can be beneficial + >>> from sklearn.cluster import DBSCAN >>> mapper = make_mapper_pipeline(clusterer=DBSCAN(), ... n_jobs=6, @@ -324,8 +328,7 @@ def make_mapper_pipeline(scaler=None, See also -------- - :class:`MapperPipeline`, - :meth:`~gtda.mapper.utils.decorators.method_to_transform` + MapperPipeline, :func:`~gtda.mapper.utils.decorators.method_to_transform` References ---------- @@ -337,6 +340,10 @@ def make_mapper_pipeline(scaler=None, `joblib documentation `_. + .. [3] "Caching transformers: avoid repeated computation", in + `scikit-learn documentation \ + `_. 
+ """ # TODO: Implement parameter validation diff --git a/gtda/mapper/tests/test_visualization.py b/gtda/mapper/tests/test_visualization.py index 3bf90781b..0032e4067 100644 --- a/gtda/mapper/tests/test_visualization.py +++ b/gtda/mapper/tests/test_visualization.py @@ -52,12 +52,16 @@ def test_is_data_present(self): fig = plot_static_mapper_graph(pipe, X, color_variable=colors, clone_pipeline=False) - xy = np.stack([fig.get_state()['_data'][1][c] - for c in ['x', 'y']]).transpose() - assert X.shape >= xy.shape + node_trace_x = fig.get_state()['_data'][1]["x"] + node_trace_y = fig.get_state()['_data'][1]["y"] - real_colors = fig.get_state()['_data'][1]['marker']['color'] - assert len(real_colors) == xy.shape[0] + assert node_trace_x["shape"][0] == node_trace_y["shape"][0] + + num_nodes = node_trace_x["shape"][0] + assert len(X) >= num_nodes + + fig_colors = fig.get_state()['_data'][1]['marker']['color'] + assert len(fig_colors) == num_nodes class TestInteractivePlot(TestCaseNoTemplate): @@ -73,8 +77,8 @@ def _get_widget_by_trait(self, fig, key, val=None): pass def _get_size_from_hovertext(self, s): - from re import split - return int(split(':|<', s)[-1]) + size_str = s.split("
")[1].split(": ")[1] + return int(size_str) def test_cluster_sizes(self): """Verify that the total number of calculated clusters is equal to @@ -89,7 +93,7 @@ def test_cluster_sizes(self): ['_data'][1]['hovertext']] g = pipe.fit_transform(X) - node_size_real = [len(l) - for l in g['node_metadata']['node_elements']] + node_size_real = [len(node) + for node in g['node_metadata']['node_elements']] assert sum(node_sizes_vis) == sum(node_size_real) diff --git a/gtda/mapper/utils/_visualization.py b/gtda/mapper/utils/_visualization.py new file mode 100644 index 000000000..38eae138d --- /dev/null +++ b/gtda/mapper/utils/_visualization.py @@ -0,0 +1,396 @@ +"""Graph layout functions and plotly layout functions.""" +# License: GNU AGPLv3 + +import operator +from copy import deepcopy +from functools import reduce, partial + +import numpy as np +import plotly.graph_objs as go + +PLOT_OPTIONS_NODE_TRACE_DEFAULTS = { + "name": "node_trace", + "mode": "markers", + "hoverinfo": "text", + "marker": { + "colorscale": "viridis", + "opacity": 1., + "showscale": True, + "reversescale": False, + "line": {"width": 1, "color": "#888"}, + "sizemode": "area", + "sizemin": 4, + "colorbar": { + "thickness": 15, "title": "", "xanchor": "left", + "titleside": "right" + } + } +} + +PLOT_OPTIONS_EDGE_TRACE_DEFAULTS = { + "name": "edge_trace", + "mode": "lines", + "line": {"width": 1, "color": "#888"}, + "hoverinfo": "none" +} + +PLOT_OPTIONS_LAYOUT_COMMON_DEFAULTS = { + "showlegend": False, + "hovermode": "closest", + "title": "", + "margin": {"b": 20, "l": 5, "r": 5, "t": 40}, + "autosize": False, + "annotations": [] +} + +PLOT_OPTIONS_LAYOUT_AXES_DEFAULTS_2D = { + "title": "", "showgrid": False, "zeroline": False, "showticklabels": False, + "ticks": "", "showline": False +} + +PLOT_OPTIONS_LAYOUT_AXES_DEFAULTS_3D = { + "title": "", "showbackground": False, "showline": False, "zeroline": False, + "showgrid": False, "showticklabels": False, +} + +PLOT_OPTIONS_LAYOUT_DEFAULTS = { + "common": 
PLOT_OPTIONS_LAYOUT_COMMON_DEFAULTS, + 2: { + "template": "simple_white", + "xaxis": PLOT_OPTIONS_LAYOUT_AXES_DEFAULTS_2D, + "yaxis": PLOT_OPTIONS_LAYOUT_AXES_DEFAULTS_2D + }, + 3: { + "scene": { + "xaxis": PLOT_OPTIONS_LAYOUT_AXES_DEFAULTS_3D, + "yaxis": PLOT_OPTIONS_LAYOUT_AXES_DEFAULTS_3D, + "zaxis": PLOT_OPTIONS_LAYOUT_AXES_DEFAULTS_3D + } + } +} + + +def _set_node_sizeref(node_sizes, node_scale=12): + # Formula from Plotly https://plot.ly/python/bubble-charts/ + return 2. * max(node_sizes) / (node_scale ** 2) + + +def _round_to_n_sig_figs(x, n=3): + """Round a number x to n significant figures.""" + if n is None: + return x + if not x: + return 0 + return np.round(x, -int(np.floor(np.log10(np.abs(x)))) + (n - 1)) + + +def _get_node_size(node_elements): + # TODO: Add doc strings to all functions + return list(map(len, node_elements)) + + +def _get_node_text( + node_ids, num_node_elements, node_summary_statistics +): + return [ + f"Node ID: {node_id}
Node size: {num_elements}" + f"
Summary statistic: {node_summary_statistic}" + for node_id, num_elements, node_summary_statistic in zip( + node_ids, num_node_elements, node_summary_statistics + ) + ] + + +def _get_node_summary(data, node_elements, summary_statistic): + return list(map(summary_statistic, (data[itr] for itr in node_elements))) + + +def _get_column_color_buttons( + data, is_data_dataframe, node_elements, node_colors_color_variable, + summary_statistic, hovertext_color_variable, + colorscale_for_hoverlabel, n_sig_figs +): + # TODO: Consider opting for just-in-time computation instead of computing + # all node summary values ahead of time. Solution should preserve scroll + # zoom functionality of 2D static visualisation. + def replace_summary_statistic(current_hovertext, new_statistic): + pos = current_hovertext.rfind(" ") + new_hovertext = current_hovertext[:pos] + \ + f" {_round_to_n_sig_figs(new_statistic, n=n_sig_figs)}" + return new_hovertext + + if is_data_dataframe: + columns_to_color = data.columns + else: + columns_to_color = range(data.shape[1]) + + column_color_buttons = [ + { + "args": [{ + "marker.color": [None, node_colors_color_variable], + "hovertext": [None, hovertext_color_variable] + }], + "label": "color_variable", + "method": "restyle" + } + ] + + for column in columns_to_color: + if is_data_dataframe: + column_values = data[column].to_numpy() + else: + column_values = data[:, column] + + node_colors = _get_node_summary( + column_values, node_elements, summary_statistic + ) + hovertext = list(map( + replace_summary_statistic, hovertext_color_variable, + node_colors + )) + + new_button = { + "args": [{ + "marker.color": [None, node_colors], + "hovertext": [None, hovertext] + }], + "label": f"Column {column}", + "method": "restyle" + } + + if colorscale_for_hoverlabel is not None: + node_colors = np.asarray(node_colors) + min_col = np.min(node_colors) + max_col = np.max(node_colors) + new_button["args"][0]["hoverlabel.bgcolor"] = [ + None, + 
_get_colors_for_vals(node_colors, min_col, max_col, + colorscale_for_hoverlabel) + ] + + column_color_buttons.append(new_button) + + return column_color_buttons + + +def _infer_color_variable_kind(color_variable, data): + """Determine whether color_variable is array, pandas dataframe, callable, + or scikit-learn (fit-)transformer.""" + if hasattr(color_variable, "dtype") or hasattr(color_variable, "dtypes"): + if len(color_variable) != len(data): + raise ValueError( + "color_variable and data must have the same length.") + color_variable_kind = "scalars" + elif hasattr(color_variable, "transform"): + color_variable_kind = "transformer" + elif hasattr(color_variable, "fit_transform"): + color_variable_kind = "fit_transformer" + elif callable(color_variable): + color_variable_kind = "callable" + elif color_variable is None: + color_variable_kind = "none" + else: # Assume color_variable is a selection of columns + color_variable_kind = "else" + + return color_variable_kind + + +def _get_node_summary_statistics( + data, is_data_dataframe, node_elements, summary_statistic, + color_variable +): + """Calculate values of node summary statistics.""" + color_variable_kind = _infer_color_variable_kind(color_variable, data) + + if color_variable_kind == "scalars": + color_data = color_variable + elif color_variable_kind == "transformer": + color_data = color_variable.transform(data) + elif color_variable_kind == "fit_transformer": + color_data = color_variable.fit_transform(data) + elif color_variable_kind == "callable": + color_data = color_variable(data) + elif color_variable_kind == "none": + if is_data_dataframe: + color_data = data.to_numpy() + else: + color_data = data + else: + if is_data_dataframe: + color_data = data[color_variable].to_numpy() + else: + color_data = data[:, color_variable] + + return _get_node_summary(color_data, node_elements, summary_statistic) + + +def _calculate_graph_data( + pipeline, data, is_data_dataframe, layout, layout_dim, color_variable, + 
node_color_statistic, n_sig_figs, node_scale +): + graph = pipeline.fit_transform(data) + node_elements = graph["node_metadata"]["node_elements"] + + # Determine whether node_color_statistic is an array of node colors + is_node_color_statistic_ndarray = hasattr(node_color_statistic, "dtype") + if not (is_node_color_statistic_ndarray or callable(node_color_statistic)): + raise ValueError( + "`node_color_statistic` must be a callable or ndarray." + ) + + # Compute the raw values of node summary statistics + if is_node_color_statistic_ndarray: + node_colors_color_variable = node_color_statistic + else: + node_colors_color_variable = _get_node_summary_statistics( + data, is_data_dataframe, node_elements, node_color_statistic, + color_variable + ) + + # Load defaults for node and edge traces + plot_options = { + "node_trace": deepcopy(PLOT_OPTIONS_NODE_TRACE_DEFAULTS), + "edge_trace": deepcopy(PLOT_OPTIONS_EDGE_TRACE_DEFAULTS) + } + + # Update size and color of nodes + node_sizes = _get_node_size(node_elements) + plot_options["node_trace"]["marker"].update({ + "size": node_sizes, + "sizeref": _set_node_sizeref(node_sizes, node_scale=node_scale), + "color": node_colors_color_variable + }) + + # Generate hovertext + node_ids = graph["node_metadata"]["node_id"] + num_node_elements = map(len, graph["node_metadata"]["node_elements"]) + node_colors_round = map( + partial(_round_to_n_sig_figs, n=n_sig_figs), node_colors_color_variable + ) + plot_options["node_trace"]["hovertext"] = _get_node_text( + node_ids, num_node_elements, node_colors_round + ) + + # Compute graph layout + is_layout_ndarray = hasattr(layout, "dtype") + if is_layout_ndarray: + if layout.shape[1] not in [2, 3]: + raise ValueError( + f"If an ndarray, `layout` must be 2D with 2 or 3 columns. " + f"Array with {layout.shape[1]} columns passed." + ) + node_pos = layout + else: + if layout_dim not in [2, 3]: + raise ValueError( + f"`layout_dim` must be either 2 or 3. {layout_dim} entered." 
+ ) + node_pos = np.asarray(graph.layout(layout, dim=layout_dim).coords) + + # Store x and y coordinates of edge endpoints + edge_x = list( + reduce( + operator.iconcat, map( + lambda e: [node_pos[e.source, 0], node_pos[e.target, 0], + None], + graph.es + ), [] + ) + ) + edge_y = list( + reduce( + operator.iconcat, map( + lambda e: [node_pos[e.source, 1], node_pos[e.target, 1], + None], + graph.es + ), [] + ) + ) + + if layout_dim == 2: + node_trace = go.Scatter( + x=node_pos[:, 0], y=node_pos[:, 1], **plot_options["node_trace"] + ) + + edge_trace = go.Scatter( + x=edge_x, y=edge_y, **plot_options["edge_trace"] + ) + + else: + node_trace = go.Scatter3d( + x=node_pos[:, 0], y=node_pos[:, 1], z=node_pos[:, 2], + **plot_options["node_trace"] + ) + + edge_z = list( + reduce( + operator.iconcat, map( + lambda e: [node_pos[e.source][2], node_pos[e.target][2], + None], + graph.es + ), [] + ) + ) + edge_trace = go.Scatter3d( + x=edge_x, y=edge_y, z=edge_z, **plot_options["edge_trace"]) + + return edge_trace, node_trace, node_elements, node_colors_color_variable + + +def _hex_to_rgb(value): + """Convert a hex-formatted color to rgb, ignoring alpha values.""" + value = value.lstrip("#") + return [int(value[i:i + 2], 16) for i in range(0, 6, 2)] + + +def _rbg_to_hex(c): + """Convert an rgb-formatted color to hex, ignoring alpha values.""" + return f"#{c[0]:02x}{c[1]:02x}{c[2]:02x}" + + +def _get_colors_for_vals(vals, vmin, vmax, colorscale, return_hex=True): + """Given a float array vals, interpolate based on a colorscale to obtain + rgb or hex colors. 
Inspired by + `user empet's answer in \ + `_.""" + from numbers import Number + from ast import literal_eval + + if vmin >= vmax: + raise ValueError("`vmin` should be < `vmax`.") + + if (len(colorscale[0]) == 2) and isinstance(colorscale[0][0], Number): + scale, colors = zip(*colorscale) + else: + scale = np.linspace(0, 1, num=len(colorscale)) + colors = colorscale + scale = np.asarray(scale) + + if colors[0][:3] == "rgb": + colors = np.asarray([literal_eval(color[3:]) for color in colors], + dtype=np.float_) + elif colors[0][0] == "#": + colors = np.asarray(list(map(_hex_to_rgb, colors)), dtype=np.float_) + else: + raise ValueError("This colorscale is not supported.") + + colorscale = np.hstack([scale.reshape(-1, 1), colors]) + colorscale = np.vstack([colorscale, colorscale[0, :]]) + colorscale_diffs = np.diff(colorscale, axis=0) + colorscale_diff_ratios = colorscale_diffs[:, 1:] / colorscale_diffs[:, [0]] + colorscale_diff_ratios[-1, :] = np.zeros(3) + + vals_scaled = (vals - vmin) / (vmax - vmin) + + left_bin_indices = np.digitize(vals_scaled, scale) - 1 + left_endpts = colorscale[left_bin_indices] + vals_scaled -= left_endpts[:, 0] + diff_ratios = colorscale_diff_ratios[left_bin_indices] + + vals_rgb = ( + left_endpts[:, 1:] + diff_ratios * vals_scaled[:, np.newaxis] + 0.5 + ).astype(np.uint8) + + if return_hex: + return list(map(_rbg_to_hex, vals_rgb)) + return [f"rgb{tuple(v)}" for v in vals_rgb] diff --git a/gtda/mapper/utils/decorators.py b/gtda/mapper/utils/decorators.py index 48e2e7b05..5c985c197 100644 --- a/gtda/mapper/utils/decorators.py +++ b/gtda/mapper/utils/decorators.py @@ -8,9 +8,12 @@ def method_to_transform(cls, method_name): """Wrap a class to add a :meth:`transform` method as an alias to an existing method. - An example of use is for classes possessing a :meth:`score` method such - as kernel density estimators and anomaly/novelty detection estimators, - to allow for these estimators are to be used as steps in a pipeline. 
+ An example of use is for classes possessing a :meth:`score` method such as + kernel density estimators and anomaly/novelty detection estimators, to + allow for these estimators to be used as steps in a pipeline. + + Note that 1D array outputs are reshaped into 2D column vectors before + being returned by the new :meth:`transform`. Parameters ---------- @@ -20,20 +23,19 @@ def method_to_transform(cls, method_name): method_name : str Name of the method in `cls` to which :meth:`transform` will be - an alias. The fist argument of this method becomes the `X` - input for :meth:`transform`. + an alias. The first argument of this method (after ``self``) becomes + the ``X`` input for :meth:`transform`. Returns ------- wrapped_cls : object - New class inheriting from :class:`sklearn.base.TransformerMixin`, - so that a :meth:`fit_transform` is also available. Its name is the - name of `cls` prepended with ``'Extended'``. + New class inheriting from :class:`sklearn.base.TransformerMixin`, so + that both :meth:`transform` and :meth:`fit_transform` are available. + Its name is the name of `cls` prepended with ``'Extended'``. Examples -------- >>> import numpy as np - >>> from numpy.testing import assert_almost_equal >>> from sklearn.neighbors import KernelDensity >>> from gtda.mapper import method_to_transform >>> X = np.random.random((100, 2)) @@ -41,8 +43,12 @@ def method_to_transform(cls, method_name): >>> kde_extended = method_to_transform( ... 
KernelDensity, 'score_samples')() >>> Xt = kde.fit(X).score_samples(X) + >>> print(Xt.shape) + (100,) >>> Xt_extended = kde_extended.fit_transform(X) - >>> assert_almost_equal(Xt, Xt_extended) + >>> print(Xt_extended.shape) + (100, 1) + >>> np.array_equal(Xt, Xt_extended.flatten()) True """ diff --git a/gtda/mapper/utils/visualization.py b/gtda/mapper/utils/visualization.py deleted file mode 100644 index 1c576b655..000000000 --- a/gtda/mapper/utils/visualization.py +++ /dev/null @@ -1,327 +0,0 @@ -"""Graph layout functions and plotly layout functions.""" -# License: GNU AGPLv3 - -import operator -from functools import reduce - -import numpy as np -import plotly.graph_objs as go -from matplotlib.cm import get_cmap -from matplotlib.colors import rgb2hex - - -def _get_node_size(node_elements): - # TODO: Add doc strings to all functions - return list(map(len, node_elements)) - - -def _get_node_text(graph): - return [ - f"Node ID: {node_id}
Node size: {len(node_elements)}" - for node_id, node_elements in zip( - graph["node_metadata"]["node_id"], - graph["node_metadata"]["node_elements"]) - ] - - -def set_node_sizeref(node_elements, node_scale=12): - # Formula from Plotly https://plot.ly/python/bubble-charts/ - return 2. * max(_get_node_size(node_elements)) / (node_scale ** 2) - - -def get_node_summary(node_elements, data, summary_stat=np.mean): - return np.asarray( - list(map(summary_stat, [data[itr] for itr in node_elements])) - ) - - -def _get_column_color_buttons(data, is_data_dataframe, node_elements, - node_colors_color_variable, colorscale): - # TODO: Consider opting for just-in-time computation instead of computing - # all node summary values ahead-of-time. Solution should preserve scroll - # zoom functionality of 2D static visualisation. - if is_data_dataframe: - columns_to_color = data.columns - else: - columns_to_color = range(data.shape[1]) - - node_color_map = list(map(lambda x: rgb2hex(get_cmap(colorscale)(x)), - node_colors_color_variable)) - - column_color_buttons = [ - dict( - args=[{ - 'marker.color': [None, node_color_map], - 'marker.cmin': [None, np.min(node_colors_color_variable)], - 'marker.cmax': [None, np.max(node_colors_color_variable)], - 'hoverlabel.bgcolor': [None, node_color_map] - }], - label='color_variable', - method='restyle' - ) - ] - - for column in columns_to_color: - if is_data_dataframe: - column_values = data[column].to_numpy() - else: - column_values = data[:, column] - node_colors = get_node_summary(node_elements, column_values) - node_color_map = list(map(lambda x: rgb2hex(get_cmap(colorscale)(x)), - node_colors)) - - column_color_buttons.append( - dict( - args=[{ - 'marker.color': [None, node_color_map], - 'marker.cmin': [None, np.min(node_colors)], - 'marker.cmax': [None, np.max(node_colors)], - 'hoverlabel.bgcolor': [None, node_color_map] - }], - label=f'Column {column}', - method='restyle' - ) - ) - return column_color_buttons - - -def 
_infer_color_variable_kind(color_variable, data): - """Determines whether color_variable is array, pandas dataframe, callable, - or scikit-learn transformer or fit_transformer.""" - if hasattr(color_variable, 'dtype') or hasattr(color_variable, 'dtypes'): - if len(color_variable) != len(data): - raise ValueError( - "color_variable and data must have the same length.") - color_variable_kind = 'scalars' - elif hasattr(color_variable, 'transform'): - color_variable_kind = 'transformer' - elif hasattr(color_variable, 'fit_transform'): - color_variable_kind = 'fit_transformer' - elif callable(color_variable): - color_variable_kind = 'callable' - elif color_variable is None: - color_variable_kind = 'none' - else: # Assume color_variable is a selection of columns - color_variable_kind = 'else' - - return color_variable_kind - - -def _get_node_colors(data, is_data_dataframe, node_elements, - is_node_colors_ndarray, node_color_statistic, - color_variable, color_variable_kind): - """Calculate node colors""" - if is_node_colors_ndarray: - node_colors = node_color_statistic - else: - if color_variable_kind == 'scalars': - color_data = color_variable - elif color_variable_kind == 'transformer': - color_data = color_variable.transform(data) - elif color_variable_kind == 'fit_transformer': - color_data = color_variable.fit_transform(data) - elif color_variable_kind == 'callable': - color_data = color_variable(data) - elif color_variable_kind == 'none': - if is_data_dataframe: - color_data = data.to_numpy() - else: - color_data = data - else: - if is_data_dataframe: - color_data = data[color_variable].to_numpy() - else: - color_data = data[:, color_variable] - - node_colors = get_node_summary( - node_elements, color_data, summary_stat=node_color_statistic) - - # Check if node_colors contains NaNs - if any(np.logical_not(np.isfinite(node_colors))): - from warnings import warn - warn('NaN values detected in the array of Mapper node colors!' 
- 'These values will be ignored in the color scale', RuntimeWarning) - - # Normalise node colours in range [0,1] for colorscale mapping - node_colors = (node_colors - np.nanmin(node_colors)) / \ - (np.nanmax(node_colors) - np.nanmin(node_colors)) - - return node_colors - - -def _calculate_graph_data( - pipeline, data, layout, layout_dim, - color_variable, node_color_statistic, plotly_kwargs): - graph = pipeline.fit_transform(data) - node_elements = graph['node_metadata']['node_elements'] - - # Simple duck typing to determine whether data is a pandas dataframe - is_data_dataframe = hasattr(data, 'columns') - - # Determine whether layout is an array of node positions - is_layout_ndarray = hasattr(layout, 'dtype') - if is_layout_ndarray: - node_pos = layout - else: - node_pos = graph.layout(layout, dim=layout_dim) - - color_variable_kind = _infer_color_variable_kind(color_variable, data) - - # Determine whether node_colors is an array of node colors - is_node_colors_ndarray = hasattr(node_color_statistic, 'dtype') - if (not is_node_colors_ndarray) and (not callable(node_color_statistic)): - raise ValueError("node_color_statistic must be a callable or ndarray.") - - node_colors = _get_node_colors( - data, is_data_dataframe, node_elements, - is_node_colors_ndarray, node_color_statistic, - color_variable, color_variable_kind) - - plot_options = { - 'edge_trace_mode': 'lines', - 'edge_trace_line': dict(color='#888', width=1), - 'edge_trace_hoverinfo': 'none', - 'node_trace_mode': 'markers', - 'node_trace_hoverinfo': 'text', - 'node_trace_hoverlabel': dict( - bgcolor=list(map(lambda x: rgb2hex(get_cmap('viridis')(x)), - node_colors))), - 'node_trace_marker_color': list( - map(lambda x: rgb2hex(get_cmap('viridis')(x)), node_colors)), - 'node_trace_marker_colorscale': 'viridis', - 'node_trace_marker_showscale': True, - 'node_trace_marker_reversescale': False, - 'node_trace_marker_line': dict(width=.5, color='#888'), - 'node_trace_marker_size': _get_node_size(node_elements), 
- 'node_trace_marker_sizemode': 'area', - 'node_trace_marker_sizeref': set_node_sizeref(node_elements), - 'node_trace_marker_sizemin': 4, - 'node_trace_marker_cmin': np.min(node_colors), - 'node_trace_marker_cmax': np.max(node_colors), - 'node_trace_marker_colorbar': dict(thickness=15, - title='', - xanchor='left', - titleside='right'), - 'node_trace_marker_line_width': 2, - 'node_trace_text': _get_node_text(graph), - 'layout_showlegend': False, - 'layout_hovermode': 'closest', - 'layout_xaxis_title': "", - 'layout_yaxis_title': "", - 'layout_title': "", - 'layout_margin': {'b': 20, 'l': 5, 'r': 5, 't': 40}, - 'layout_annotations': list() - } - - if plotly_kwargs is not None: - plot_options.update(plotly_kwargs) - - edge_x = list(reduce(operator.iconcat, - map(lambda x: [node_pos[x[0]][0], - node_pos[x[1]][0], None], - graph.get_edgelist()), [])) - edge_y = list(reduce(operator.iconcat, - map(lambda x: [node_pos[x[0]][1], - node_pos[x[1]][1], None], - graph.get_edgelist()), [])) - - node_x = [node_pos[k][0] for k in range(graph.vcount())] - node_y = [node_pos[k][1] for k in range(graph.vcount())] - - if layout_dim == 2: - plot_options.update({ - 'layout_xaxis': dict(showgrid=False, zeroline=False, - showticklabels=False, ticks="", - showline=False), - 'layout_yaxis': dict(showgrid=False, zeroline=False, - showticklabels=False, ticks="", - showline=False), - }) - edge_trace = go.Scatter( - x=edge_x, - y=edge_y, - line=plot_options['edge_trace_line'], - hoverinfo=plot_options['edge_trace_hoverinfo'], - mode=plot_options['edge_trace_mode']) - - node_trace = go.Scatter( - x=node_x, - y=node_y, - mode=plot_options['node_trace_mode'], - hoverinfo=plot_options['node_trace_hoverinfo'], - hovertext=plot_options['node_trace_text'], - marker=dict( - showscale=plot_options['node_trace_marker_showscale'], - colorscale=plot_options['node_trace_marker_colorscale'], - reversescale=plot_options['node_trace_marker_reversescale'], - line=plot_options['node_trace_marker_line'], - 
color=list( - map(lambda x: rgb2hex( - get_cmap( - plot_options['node_trace_marker_colorscale'] - )(x)), node_colors)), - size=plot_options['node_trace_marker_size'], - sizemode=plot_options['node_trace_marker_sizemode'], - sizeref=plot_options['node_trace_marker_sizeref'], - sizemin=plot_options['node_trace_marker_sizemin'], - cmin=plot_options['node_trace_marker_cmin'], - cmax=plot_options['node_trace_marker_cmax'], - colorbar=plot_options['node_trace_marker_colorbar'], - line_width=plot_options['node_trace_marker_line_width']), - text=plot_options['node_trace_text']) - elif layout_dim == 3: - plot_options.update({ - 'axis': dict(showbackground=False, - showline=False, - zeroline=False, - showgrid=False, - showticklabels=False, - title='') - }) - plot_options['layout_scene'] = dict(xaxis=dict(plot_options['axis']), - yaxis=dict( - plot_options['axis']), - zaxis=dict( - plot_options['axis']) - ) - - edge_z = list(reduce(operator.iconcat, - map(lambda x: [node_pos[x[0]][2], - node_pos[x[1]][2], None], - graph.get_edgelist()), [])) - - node_z = [node_pos[k][2] for k in range(graph.vcount())] - - edge_trace = go.Scatter3d( - x=edge_x, - y=edge_y, - z=edge_z, - mode=plot_options['edge_trace_mode'], - line=plot_options['edge_trace_line'], - hoverinfo=plot_options['edge_trace_hoverinfo']) - - node_trace = go.Scatter3d( - x=node_x, - y=node_y, - z=node_z, - mode=plot_options['node_trace_mode'], - hoverinfo=plot_options['node_trace_hoverinfo'], - hoverlabel=plot_options['node_trace_hoverlabel'], - hovertext=plot_options['node_trace_text'], - marker=dict( - showscale=plot_options['node_trace_marker_showscale'], - colorscale=plot_options['node_trace_marker_colorscale'], - reversescale=plot_options['node_trace_marker_reversescale'], - line=plot_options['node_trace_marker_line'], - color=plot_options['node_trace_marker_color'], - size=plot_options['node_trace_marker_size'], - sizemode=plot_options['node_trace_marker_sizemode'], - 
sizeref=plot_options['node_trace_marker_sizeref'], - sizemin=plot_options['node_trace_marker_sizemin'], - cmin=plot_options['node_trace_marker_cmin'], - cmax=plot_options['node_trace_marker_cmax'], - colorbar=plot_options['node_trace_marker_colorbar'], - line_width=plot_options['node_trace_marker_line_width']), - text=plot_options['node_trace_text']) - - return node_trace, edge_trace, node_elements, node_colors, plot_options diff --git a/gtda/mapper/visualization.py b/gtda/mapper/visualization.py index e59215eec..e64641ce2 100644 --- a/gtda/mapper/visualization.py +++ b/gtda/mapper/visualization.py @@ -3,6 +3,8 @@ import logging import traceback +from copy import deepcopy +from warnings import warn import numpy as np import plotly.graph_objects as go @@ -10,30 +12,36 @@ from sklearn.base import clone from .utils._logging import OutputWidgetHandler -from .utils.visualization import (_calculate_graph_data, - _get_column_color_buttons) +from .utils._visualization import ( + _calculate_graph_data, + _get_column_color_buttons, + _get_colors_for_vals, + PLOT_OPTIONS_LAYOUT_DEFAULTS +) def plot_static_mapper_graph( - pipeline, data, layout='kamada_kawai', layout_dim=2, + pipeline, data, layout="kamada_kawai", layout_dim=2, color_variable=None, node_color_statistic=None, - color_by_columns_dropdown=False, plotly_kwargs=None, - clone_pipeline=True): + color_by_columns_dropdown=False, clone_pipeline=True, n_sig_figs=3, + node_scale=12, plotly_params=None +): """Plotting function for static Mapper graphs. - Nodes are colored according to :attr:`color_variable`. By default, the - hovertext displays a globally unique ID and the number of elements - associated with a given node. + Nodes are colored according to `color_variable` and `node_color_statistic`. + By default, the hovertext on each node displays a globally unique ID for + the node, the number of data points associated with the node, and the + summary statistic which determines its color. 
Parameters ---------- pipeline : :class:`~gtda.mapper.pipeline.MapperPipeline` object - Mapper pipeline to act on to data. + Mapper pipeline to act onto data. data : array-like of shape (n_samples, n_features) Data used to generate the Mapper graph. Can be a pandas dataframe. - layout : None, str or callable, optional, default: ``'kamada-kawai'`` + layout : None, str or callable, optional, default: ``"kamada-kawai"`` Layout algorithm for the graph. Can be any accepted value for the ``layout`` parameter in the :meth:`layout` method of :class:`igraph.Graph`. [1]_ @@ -42,48 +50,80 @@ def plot_static_mapper_graph( The number of dimensions for the layout. Can be 2 or 3. color_variable : object or None, optional, default: ``None`` - Specifies which quantity is to be used for node coloring. + Specifies a feature of interest to be used, together with + `node_color_statistic`, to determine node colors. - 1. If a numpy ndarray or pandas dataframe, `color_variable` - must have the same length as `data` and is interpreted as - a quantity of interest according to which node of the Mapper - graph is to be colored (see `node_color_statistic`). - 2. If ``None`` then equivalent to passing `data`. + 1. If a numpy array or pandas dataframe, it must have the same + length as `data`. + 2. ``None`` is equivalent to passing `data`. 3. If an object implementing :meth:`transform` or - :meth:`fit_transform`, e.g. a scikit-learn estimator or - pipeline, it is applied to `data` to generate the quantity - of interest. - 4. If an index or string, or list of indices / strings, equivalent - to selecting a column or subset of columns from `data`. + :meth:`fit_transform`, it is applied to `data` to generate the + feature of interest. + 4. If an index or string, or list of indices/strings, it is + equivalent to selecting a column or subset of columns from + `data`. 
node_color_statistic : None, callable, or ndarray of shape (n_nodes,) or \ (n_nodes, 1), optional, default: ``None`` - Specifies how to determine the colors of each node. If a - numpy array, it must have the same length as the number of nodes in - the Mapper graph, and its values are used directly for node - coloring, ignoring `color_variable`. Otherwise, it can be a - callable object which is used to obtain a summary statistic, within - each Mapper node, of the quantity specified by `color_variable`. The - default value ``None`` is equivalent to passing ``numpy.mean``. + If a callable, node colors will be computed as summary statistics from + the feature array ``Y`` determined by `color_variable` – specifically, + the color of a node representing the entries of `data` whose row + indices are in ``I`` will be ``node_color_statistic(Y[I])``. ``None`` + is equivalent to passing :func:`numpy.mean`. If a numpy array, it must + have the same length as the number of nodes in the Mapper graph and its + values are used directly as node colors (`color_variable` is ignored). color_by_columns_dropdown : bool, optional, default: ``False`` If ``True``, a dropdown widget is generated which allows the user to - color Mapper nodes according to any column in `data`. - - plotly_kwargs : dict, optional, default: ``None`` - Keyword arguments to configure the Plotly Figure. + color Mapper nodes according to any column in `data` (still using + `node_color_statistic`) in addition to `color_variable`. clone_pipeline : bool, optional, default: ``True`` If ``True``, the input `pipeline` is cloned before computing the Mapper graph to prevent unexpected side effects from in-place parameter updates. + n_sig_figs : int or None, optional, default: ``3`` + If not ``None``, number of significant figures to which to round + node summary statistics. If ``None``, no rounding is performed. 
+ + node_scale : int or float, optional, default: ``12`` + Sets the scale factor used to determine the rendered size of the + nodes. Increase for larger nodes. Implements a formula in the + `Plotly documentation \ + `_. + + plotly_params : dict or None, optional, default: ``None`` + Custom parameters to configure the plotly figure. Allowed keys are + ``"node_trace"``, ``"edge_trace"`` and ``"layout"``, and the + corresponding values should be dictionaries containing keyword + arguments as would be fed to the :meth:`update_traces` and + :meth:`update_layout` methods of :class:`plotly.graph_objects.Figure`. + Returns ------- fig : :class:`plotly.graph_objects.Figure` object Figure representing the Mapper graph with appropriate node colouring and size. + Examples + -------- + Setting a colorscale different from the default one: + + >>> import numpy as np + >>> from gtda.mapper import make_mapper_pipeline, plot_static_mapper_graph + >>> pipeline = make_mapper_pipeline() + >>> data = np.random.random((100, 3)) + >>> plotly_params = {"node_trace": {"marker_colorscale": "Blues"}} + >>> fig = plot_static_mapper_graph(pipeline, data, + ... plotly_params=plotly_params) + + See also + -------- + :func:`~gtda.mapper.visualization.plot_interactive_mapper_graph`, + :func:`~gtda.mapper.pipeline.make_mapper_pipeline` + References ---------- .. 
[1] `igraph.Graph.layout @@ -93,57 +133,91 @@ def plot_static_mapper_graph( """ # Compute the graph and fetch the indices of points in each node - if clone_pipeline: - pipe = clone(pipeline) - else: - pipe = pipeline + _pipeline = clone(pipeline) if clone_pipeline else pipeline - if node_color_statistic is not None: - _node_color_statistic = node_color_statistic - else: - _node_color_statistic = np.mean + _node_color_statistic = node_color_statistic or np.mean - # Simple duck typing to determine whether data is a pandas dataframe - is_data_dataframe = hasattr(data, 'columns') + # Simple duck typing to determine whether data is likely a pandas dataframe + is_data_dataframe = hasattr(data, "columns") - node_trace, edge_trace, node_elements, _node_colors, plot_options = \ + edge_trace, node_trace, node_elements, node_colors_color_variable = \ _calculate_graph_data( - pipe, data, layout, layout_dim, - color_variable, _node_color_statistic, plotly_kwargs) - - # Define layout options that are common to 2D and 3D figures - layout_options_common = go.Layout( - showlegend=plot_options['layout_showlegend'], - hovermode=plot_options['layout_hovermode'], - margin=plot_options['layout_margin'], - autosize=False + _pipeline, data, is_data_dataframe, layout, layout_dim, + color_variable, _node_color_statistic, n_sig_figs, node_scale + ) + + # Define layout options + layout_options = go.Layout( + **PLOT_OPTIONS_LAYOUT_DEFAULTS["common"], + **PLOT_OPTIONS_LAYOUT_DEFAULTS[layout_dim] ) - fig = go.FigureWidget(data=[edge_trace, node_trace], - layout=layout_options_common) - - if layout_dim == 2: - layout_options_2d = { - 'layout_xaxis': plot_options['layout_xaxis'], - 'layout_xaxis_title': plot_options['layout_xaxis_title'], - 'layout_yaxis': plot_options['layout_yaxis'], - 'layout_yaxis_title': plot_options['layout_yaxis_title'], - 'layout_template': 'simple_white', - } - fig.update(layout_options_2d) - - elif layout_dim == 3: - layout_options_3d = { - 'layout_scene': 
plot_options['layout_scene'], - 'layout_annotations': plot_options['layout_annotations'], - } - fig.update(layout_options_3d) + fig = go.FigureWidget(data=[edge_trace, node_trace], layout=layout_options) + + _plotly_params = deepcopy(plotly_params) + + # When laying out the graph in 3D, plotly does not automatically give + # the background hoverlabel the same color as the respective marker, + # so we do this by hand here. + # TODO: Extract logic so as to avoid repetitions in interactive version + colorscale_for_hoverlabel = None + if layout_dim == 3: + compute_hoverlabel_bgcolor = True + if _plotly_params: + if "node_trace" in _plotly_params: + if "hoverlabel_bgcolor" in _plotly_params["node_trace"]: + fig.update_traces( + hoverlabel_bgcolor=_plotly_params["node_trace"].pop( + "hoverlabel_bgcolor" + ), + selector={"name": "node_trace"} + ) + compute_hoverlabel_bgcolor = False + if "marker_colorscale" in _plotly_params["node_trace"]: + fig.update_traces( + marker_colorscale=_plotly_params["node_trace"].pop( + "marker_colorscale" + ), + selector={"name": "node_trace"} + ) + + if compute_hoverlabel_bgcolor: + colorscale_for_hoverlabel = fig.data[1].marker.colorscale + node_colors_color_variable = np.asarray(node_colors_color_variable) + min_col = np.min(node_colors_color_variable) + max_col = np.max(node_colors_color_variable) + try: + hoverlabel_bgcolor = _get_colors_for_vals( + node_colors_color_variable, min_col, max_col, + colorscale_for_hoverlabel + ) + except Exception as e: + if e.args[0] == "This colorscale is not supported.": + warn("Data-dependent background hoverlabel colors cannot " + "be generated with this choice of colorscale. Please " + "use a standard hex- or RGB-formatted colorscale.") + else: + warn("Something went wrong in generating data-dependent " + "background hoverlabel colors. 
All background " + "hoverlabel colors will be set to white.") + hoverlabel_bgcolor = "white" + colorscale_for_hoverlabel = None + fig.update_traces( + hoverlabel_bgcolor=hoverlabel_bgcolor, + selector={"name": "node_trace"} + ) # Compute node colors according to data columns only if necessary if color_by_columns_dropdown: + hovertext_color_variable = node_trace.hovertext column_color_buttons = _get_column_color_buttons( - data, is_data_dataframe, node_elements, _node_colors, - plot_options['node_trace_marker_colorscale']) + data, is_data_dataframe, node_elements, node_colors_color_variable, + _node_color_statistic, hovertext_color_variable, + colorscale_for_hoverlabel, n_sig_figs + ) + # Avoid recomputing hoverlabel bgcolor for top button + column_color_buttons[0]["args"][0]["hoverlabel.bgcolor"] = \ + [None, fig.data[1].hoverlabel.bgcolor] else: column_color_buttons = None @@ -156,7 +230,7 @@ def plot_static_mapper_graph( pad={"r": 10, "t": 10}, showactive=True, x=0.11, - xanchor='left', + xanchor="left", y=button_height, yanchor="top" ), @@ -164,25 +238,43 @@ def plot_static_mapper_graph( if color_by_columns_dropdown: fig.add_annotation( - go.layout.Annotation(text="Color by:", x=0, xref="paper", - y=button_height - 0.045, - yref="paper", align="left", showarrow=False) + go.layout.Annotation( + text="Color by:", + x=0, + xref="paper", + y=button_height - 0.045, + yref="paper", + align="left", + showarrow=False + ) ) + # Update traces and layout according to user input + if _plotly_params: + for key in ["node_trace", "edge_trace"]: + fig.update_traces( + _plotly_params.pop(key, None), + selector={"name": key} + ) + fig.update_layout(_plotly_params.pop("layout", None)) + return fig -def plot_interactive_mapper_graph(pipeline, data, layout='kamada_kawai', - layout_dim=2, color_variable=None, - node_color_statistic=None, - color_by_columns_dropdown=False, - plotly_kwargs=None): +def plot_interactive_mapper_graph( + pipeline, data, layout="kamada_kawai", layout_dim=2, 
+ color_variable=None, node_color_statistic=None, clone_pipeline=True, + color_by_columns_dropdown=False, n_sig_figs=3, node_scale=12, + plotly_params=None +): """Plotting function for interactive Mapper graphs. Provides functionality to interactively update parameters from the cover - and clustering steps defined in :attr:`pipeline`. Nodes are colored - according to :attr:`color_variable`. By default, the hovertext displays a - globally unique ID and the number of elements associated with a given node. + and clustering steps defined in `pipeline`. Nodes are colored according to + `color_variable` and `node_color_statistic`. By default, the hovertext on + each node displays a globally unique ID for the node, the number of data + points associated with the node, and the summary statistic which determines + its color. Parameters ---------- @@ -192,7 +284,7 @@ def plot_interactive_mapper_graph(pipeline, data, layout='kamada_kawai', data : array-like of shape (n_samples, n_features) Data used to generate the Mapper graph. Can be a pandas dataframe. - layout : None, str or callable, optional, default: ``'kamada-kawai'`` + layout : None, str or callable, optional, default: ``"kamada-kawai"`` Layout algorithm for the graph. Can be any accepted value for the ``layout`` parameter in the :meth:`layout` method of :class:`igraph.Graph`. [1]_ @@ -201,43 +293,64 @@ def plot_interactive_mapper_graph(pipeline, data, layout='kamada_kawai', The number of dimensions for the layout. Can be 2 or 3. color_variable : object or None, optional, default: ``None`` - Specifies which quantity is to be used for node coloring. + Specifies a feature of interest to be used, together with + `node_color_statistic`, to determine node colors. - 1. If a numpy ndarray or pandas dataframe, `color_variable` - must have the same length as `data` and is interpreted as - a quantity of interest according to which node of the Mapper - graph is to be colored (see `node_color_statistic`). - 2. 
If ``None`` then equivalent to passing `data`. + 1. If a numpy array or pandas dataframe, it must have the same + length as `data`. + 2. ``None`` is equivalent to passing `data`. 3. If an object implementing :meth:`transform` or - :meth:`fit_transform`, e.g. a scikit-learn estimator or - pipeline, it is applied to `data` to generate the quantity - of interest. - 4. If an index or string, or list of indices / strings, equivalent - to selecting a column or subset of columns from `data`. - - node_color_statistic :None, callable, or ndarray of shape (n_nodes,) or \ - (n_nodes, 1), optional, default: ``None`` - Specifies how to determine the colors of each node. If a - numpy array, it must have the same length as the number of nodes in - the Mapper graph, and its values are used directly for node - coloring, ignoring `color_variable`. Otherwise, it can be a - callable object which is used to obtain a summary statistic, within - each Mapper node, of the quantity specified by `color_variable`. The - default value ``None`` is equivalent to passing ``numpy.mean``. + :meth:`fit_transform`, it is applied to `data` to generate the + feature of interest. + 4. If an index or string, or list of indices/strings, it is + equivalent to selecting a column or subset of columns from + `data`. + + node_color_statistic : callable or None, optional, default: ``None`` + If a callable, node colors will be computed as summary statistics from + the feature array ``Y`` determined by `color_variable` – specifically, + the color of a node representing the entries of `data` whose row + indices are in ``I`` will be ``node_color_statistic(Y[I])``. ``None`` + is equivalent to passing :func:`numpy.mean`. color_by_columns_dropdown : bool, optional, default: ``False`` If ``True``, a dropdown widget is generated which allows the user to - color Mapper nodes according to any column in `data`. 
+ color Mapper nodes according to any column in `data` (still using + `node_color_statistic`) in addition to `color_variable`. + + clone_pipeline : bool, optional, default: ``True`` + If ``True``, the input `pipeline` is cloned before computing the + Mapper graph to prevent unexpected side effects from in-place + parameter updates. + + n_sig_figs : int or None, optional, default: ``3`` + If not ``None``, number of significant figures to which to round node + summary statistics. If ``None``, no rounding is performed. + + node_scale : int or float, optional, default: ``12`` + Sets the scale factor used to determine the rendered size of the + nodes. Increase for larger nodes. Implements a formula in the + `Plotly documentation \ + `_. - plotly_kwargs : dict, optional, default: ``None`` - Keyword arguments to configure the Plotly Figure. + plotly_params : dict or None, optional, default: ``None`` + Custom parameters to configure the plotly figure. Allowed keys are + ``"node_trace"``, ``"edge_trace"`` and ``"layout"``, and the + corresponding values should be dictionaries containing keyword + arguments as would be fed to the :meth:`update_traces` and + :meth:`update_layout` methods of :class:`plotly.graph_objects.Figure`. Returns ------- box : :class:`ipywidgets.VBox` object - A box containing the following widgets: parameters of the clustering - algorithm, parameters for the covering scheme, a Mapper graph arising - from those parameters, a validation box, and logs. + A box containing the following widgets: parameters of the clustering + algorithm, parameters for the covering scheme, a Mapper graph arising + from those parameters, a validation box, and logs. 
+ + See also + -------- + :func:`~gtda.mapper.visualization.plot_static_mapper_graph`, + :func:`~gtda.mapper.pipeline.make_mapper_pipeline` References ---------- @@ -247,20 +360,17 @@ def plot_interactive_mapper_graph(pipeline, data, layout='kamada_kawai', """ - # clone pipeline to avoid side effects from in-place parameter changes - pipe = clone(pipeline) + # Clone pipeline to avoid side effects from in-place parameter changes + _pipeline = clone(pipeline) if clone_pipeline else pipeline - if node_color_statistic is not None: - _node_color_statistic = node_color_statistic - else: - _node_color_statistic = np.mean + _node_color_statistic = node_color_statistic or np.mean def get_widgets_per_param(param, value): if isinstance(value, float): return (param, widgets.FloatText( value=value, step=0.05, - description=param.split('__')[1], + description=param.split("__")[1], continuous_update=False, disabled=False )) @@ -268,65 +378,89 @@ def get_widgets_per_param(param, value): return (param, widgets.IntText( value=value, step=1, - description=param.split('__')[1], + description=param.split("__")[1], continuous_update=False, disabled=False )) elif isinstance(value, str): return (param, widgets.Text( value=value, - description=param.split('__')[1], + description=param.split("__")[1], continuous_update=False, disabled=False )) else: return None - def update_figure(figure, edge_trace, node_trace, layout_dim): - figure.data[0].x = edge_trace.x - figure.data[0].y = edge_trace.y - figure.data[1].x = node_trace.x - figure.data[1].y = node_trace.y - - if layout_dim == 3: - figure.data[0].z = edge_trace.z - figure.data[1].z = node_trace.z - - figure.data[1].marker.size = node_trace.marker.size - figure.data[1].marker.color = node_trace.marker.color - figure.data[1].marker.cmin = node_trace.marker.cmin - figure.data[1].marker.cmax = node_trace.marker.cmax - figure.data[1].marker.sizeref = node_trace.marker.sizeref - figure.data[1].hoverlabel = node_trace.hoverlabel - 
figure.data[1].hovertext = node_trace.hovertext - def on_parameter_change(change): handler.clear_logs() try: for param, value in cover_params.items(): if isinstance(value, (int, float, str)): - pipe.set_params( - **{param: cover_params_widgets[param].value}) + _pipeline.set_params( + **{param: cover_params_widgets[param].value} + ) for param, value in cluster_params.items(): if isinstance(value, (int, float, str)): - pipe.set_params( - **{param: cluster_params_widgets[param].value}) + _pipeline.set_params( + **{param: cluster_params_widgets[param].value} + ) - logger.info("Updating figure ...") + logger.info("Updating figure...") with fig.batch_update(): - (node_trace, edge_trace, node_elements, node_colors, - plot_options) = _calculate_graph_data( - pipe, data, layout, layout_dim, - color_variable, _node_color_statistic, plotly_kwargs + ( + edge_trace, node_trace, node_elements, + node_colors_color_variable + ) = _calculate_graph_data( + _pipeline, data, is_data_dataframe, layout, layout_dim, + color_variable, _node_color_statistic, n_sig_figs, + node_scale + ) + if colorscale_for_hoverlabel is not None: + node_colors_color_variable = np.asarray( + node_colors_color_variable + ) + min_col = np.min(node_colors_color_variable) + max_col = np.max(node_colors_color_variable) + hoverlabel_bgcolor = _get_colors_for_vals( + node_colors_color_variable, min_col, max_col, + colorscale_for_hoverlabel + ) + fig.update_traces( + hoverlabel_bgcolor=hoverlabel_bgcolor, + selector={"name": "node_trace"} + ) + + fig.update_traces( + x=node_trace.x, + y=node_trace.y, + marker_color=node_trace.marker.color, + marker_size=node_trace.marker.size, + marker_sizeref=node_trace.marker.sizeref, + hovertext=node_trace.hovertext, + **({"z": node_trace.z} if layout_dim == 3 else dict()), + selector={"name": "node_trace"} + ) + fig.update_traces( + x=edge_trace.x, + y=edge_trace.y, + **({"z": edge_trace.z} if layout_dim == 3 else dict()), + selector={"name": "edge_trace"} ) - 
update_figure(fig, edge_trace, node_trace, layout_dim) # Update color by column buttons - is_data_dataframe = hasattr(data, 'columns') if color_by_columns_dropdown: + hovertext_color_variable = node_trace.hovertext column_color_buttons = _get_column_color_buttons( - data, is_data_dataframe, node_elements, node_colors, - plot_options['node_trace_marker_colorscale']) + data, is_data_dataframe, node_elements, + node_colors_color_variable, _node_color_statistic, + hovertext_color_variable, colorscale_for_hoverlabel, + n_sig_figs + ) + # Avoid recomputing hoverlabel bgcolor for top button + if colorscale_for_hoverlabel is not None: + column_color_buttons[0]["args"][0][ + "hoverlabel.bgcolor"] = [None, hoverlabel_bgcolor] else: column_color_buttons = None @@ -339,7 +473,7 @@ def on_parameter_change(change): pad={"r": 10, "t": 10}, showactive=True, x=0.11, - xanchor='left', + xanchor="left", y=button_height, yanchor="top" ), @@ -354,9 +488,9 @@ def on_parameter_change(change): def observe_widgets(params, widgets): for param, value in params.items(): if isinstance(value, (int, float, str)): - widgets[param].observe(on_parameter_change, names='value') + widgets[param].observe(on_parameter_change, names="value") - # define output widget to capture logs + # Define output widget to capture logs out = widgets.Output() @out.capture() @@ -367,63 +501,102 @@ def click_box(change): else: out.clear_output() - # initialise logging + # Initialise logging logger = logging.getLogger(__name__) handler = OutputWidgetHandler() handler.setFormatter(logging.Formatter( - '%(asctime)s - [%(levelname)s] %(message)s')) + "%(asctime)s - [%(levelname)s] %(message)s")) logger.addHandler(handler) logger.setLevel(logging.INFO) - # initialise cover and cluster dictionaries of parameters and widgets - cover_params = dict(filter(lambda x: x[0].startswith('cover'), - pipe.get_mapper_params().items())) + # Initialise cover and cluster dictionaries of parameters and widgets + cover_params = dict( + filter( 
+ lambda x: x[0].startswith("cover"), + _pipeline.get_mapper_params().items() + ) + ) cover_params_widgets = dict( - filter(None, map(lambda x: get_widgets_per_param(*x), - cover_params.items()))) - cluster_params = dict(filter(lambda x: x[0].startswith('clusterer'), - pipe.get_mapper_params().items())) + filter( + None, map( + lambda x: get_widgets_per_param(*x), + cover_params.items() + ) + ) + ) + cluster_params = dict( + filter( + lambda x: x[0].startswith("clusterer"), + _pipeline.get_mapper_params().items() + ) + ) cluster_params_widgets = dict( - filter(None, map(lambda x: get_widgets_per_param(*x), - cluster_params.items()))) + filter( + None, map( + lambda x: get_widgets_per_param(*x), + cluster_params.items() + ) + ) + ) - # initialise widgets for validating input parameters of pipeline + # Initialise widgets for validating input parameters of pipeline valid = widgets.Valid( value=True, - description='Valid parameters', - style={'description_width': '100px'}, + description="Valid parameters", + style={"description_width": "100px"}, ) - # initialise widget for showing the logs + # Initialise widget for showing the logs logs_box = widgets.Checkbox( - description='Show logs: ', + description="Show logs: ", value=False, indent=False ) - # initialise figure with initial pipeline and config - if plotly_kwargs is None: - plotly_kwargs = dict() - + # Initialise figure with initial pipeline and config fig = plot_static_mapper_graph( - pipe, data, layout, layout_dim, color_variable, _node_color_statistic, - color_by_columns_dropdown, plotly_kwargs, clone_pipeline=False) + _pipeline, data, layout=layout, layout_dim=layout_dim, + color_variable=color_variable, + node_color_statistic=_node_color_statistic, + color_by_columns_dropdown=color_by_columns_dropdown, + clone_pipeline=False, n_sig_figs=n_sig_figs, node_scale=node_scale, + plotly_params=plotly_params + ) + + # Store variables for later updates + is_data_dataframe = hasattr(data, "columns") + + 
colorscale_for_hoverlabel = None + if layout_dim == 3: + # In plot_static_mapper_graph, hoverlabel bgcolors are set to white if + # something goes wrong computing them according to the colorscale. + is_bgcolor_not_white = fig.data[1].hoverlabel.bgcolor != "white" + user_hoverlabel_bgcolor = False + if plotly_params: + if "node_trace" in plotly_params: + if "hoverlabel_bgcolor" in plotly_params["node_trace"]: + user_hoverlabel_bgcolor = True + if is_bgcolor_not_white and not user_hoverlabel_bgcolor: + colorscale_for_hoverlabel = fig.data[1].marker.colorscale observe_widgets(cover_params, cover_params_widgets) observe_widgets(cluster_params, cluster_params_widgets) - logs_box.observe(click_box, names='value') + logs_box.observe(click_box, names="value") - # define containers for input widgets + # Define containers for input widgets container_cover = widgets.HBox( - children=list(cover_params_widgets.values())) + children=list(cover_params_widgets.values()) + ) - container_cluster_layout = Layout(display='flex', flex_flow='row wrap') + container_cluster_layout = Layout(display="flex", flex_flow="row wrap") container_cluster = widgets.HBox( children=list(cluster_params_widgets.values()), - layout=container_cluster_layout) + layout=container_cluster_layout + ) box = widgets.VBox( - [container_cover, container_cluster, fig, valid, logs_box, out]) + [container_cover, container_cluster, fig, valid, logs_box, out] + ) return box diff --git a/requirements.txt b/requirements.txt index 83b633f7d..320ecb400 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,6 +3,5 @@ scipy >= 0.17.0 joblib >= 0.13 scikit-learn >= 0.22.0 python-igraph >= 0.7.1.post6 -matplotlib >= 3.0.3 plotly >= 4.4.1 ipywidgets >= 7.5.1 \ No newline at end of file diff --git a/setup.py b/setup.py index a90c4784c..1c2dd5e0e 100755 --- a/setup.py +++ b/setup.py @@ -29,7 +29,7 @@ MAINTAINER_EMAIL = 'maintainers@giotto.ai' URL = 'https://github.com/giotto-ai/giotto-tda' LICENSE = 'GNU AGPLv3' -DOWNLOAD_URL 
= 'https://github.com/giotto-ai/giotto-tda/tarball/v0.2.1' +DOWNLOAD_URL = 'https://github.com/giotto-ai/giotto-tda/tarball/v0.2.2' VERSION = __version__ # noqa CLASSIFIERS = ['Intended Audience :: Science/Research', 'Intended Audience :: Developers', @@ -67,7 +67,8 @@ 'examples': [ 'jupyter', 'pandas', - 'openml'] + 'openml', + 'matplotlib'] }