
.. DO NOT EDIT.
.. THIS FILE WAS AUTOMATICALLY GENERATED BY SPHINX-GALLERY.
.. TO MAKE CHANGES, EDIT THE SOURCE PYTHON FILE:
.. "auto_examples/example_sparse_group_lasso.py"
.. LINE NUMBERS ARE GIVEN BELOW.

.. only:: html

    .. note::
        :class: sphx-glr-download-link-note

        Click :ref:`here <sphx_glr_download_auto_examples_example_sparse_group_lasso.py>`
        to download the full example code

.. rst-class:: sphx-glr-example-title

.. _sphx_glr_auto_examples_example_sparse_group_lasso.py:


GroupLasso for linear regression with dummy variables
=====================================================

A sample script for group lasso with dummy variables

.. GENERATED FROM PYTHON SOURCE LINES 9-11

Setup
-----

.. GENERATED FROM PYTHON SOURCE LINES 11-26

.. code-block:: default


    import matplotlib.pyplot as plt
    import numpy as np
    from sklearn.linear_model import Ridge
    from sklearn.metrics import r2_score
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import OneHotEncoder

    from group_lasso import GroupLasso
    from group_lasso.utils import extract_ohe_groups

    np.random.seed(42)
    GroupLasso.LOG_LOSSES = True









.. GENERATED FROM PYTHON SOURCE LINES 27-29

Set dataset parameters
----------------------

.. GENERATED FROM PYTHON SOURCE LINES 29-36

.. code-block:: default

    num_categories = 30
    min_options = 2
    max_options = 10
    num_datapoints = 10000
    noise_std = 1









.. GENERATED FROM PYTHON SOURCE LINES 37-39

Generate data matrix
--------------------

.. GENERATED FROM PYTHON SOURCE LINES 39-50

.. code-block:: default

    X_cat = np.empty((num_datapoints, num_categories))
    for i in range(num_categories):
        X_cat[:, i] = np.random.randint(min_options, max_options, num_datapoints)

    ohe = OneHotEncoder()
    X = ohe.fit_transform(X_cat)
    groups = extract_ohe_groups(ohe)
    group_sizes = [np.sum(groups == g) for g in np.unique(groups)]
    active_groups = [np.random.randint(0, 2) for _ in np.unique(groups)]









.. GENERATED FROM PYTHON SOURCE LINES 51-53

Generate coefficients
---------------------

.. GENERATED FROM PYTHON SOURCE LINES 53-64

.. code-block:: default

    w = np.concatenate(
        [
            np.random.standard_normal(group_size) * is_active
            for group_size, is_active in zip(group_sizes, active_groups)
        ]
    )
    w = w.reshape(-1, 1)
    true_coefficient_mask = w != 0
    intercept = 2









.. GENERATED FROM PYTHON SOURCE LINES 65-67

Generate regression targets
---------------------------

.. GENERATED FROM PYTHON SOURCE LINES 67-71

.. code-block:: default

    y_true = X @ w + intercept
    y = y_true + np.random.randn(*y_true.shape) * noise_std









.. GENERATED FROM PYTHON SOURCE LINES 72-74

View noisy data and compute maximum R^2
---------------------------------------

.. GENERATED FROM PYTHON SOURCE LINES 74-83

.. code-block:: default

    plt.figure()
    plt.plot(y, y_true, ".")
    plt.xlabel("Noisy targets")
    plt.ylabel("Noise-free targets")
    # Use noisy y as true because that is what we would have access
    # to in a real-life setting.
    R2_best = r2_score(y, y_true)





.. image:: /auto_examples/images/sphx_glr_example_sparse_group_lasso_001.png
    :alt: example sparse group lasso
    :class: sphx-glr-single-img





.. GENERATED FROM PYTHON SOURCE LINES 84-86

Generate pipeline and train it
------------------------------

.. GENERATED FROM PYTHON SOURCE LINES 86-107

.. code-block:: default

    pipe = pipe = Pipeline(
        memory=None,
        steps=[
            (
                "variable_selection",
                GroupLasso(
                    groups=groups,
                    group_reg=0.1,
                    l1_reg=0,
                    scale_reg=None,
                    supress_warning=True,
                    n_iter=100000,
                    frobenius_lipschitz=False,
                ),
            ),
            ("regressor", Ridge(alpha=1)),
        ],
    )
    pipe.fit(X, y)






.. rst-class:: sphx-glr-script-out

 Out:

 .. code-block:: none


    Pipeline(steps=[('variable_selection',
                     GroupLasso(group_reg=0.1,
                                groups=array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  1.,  1.,  1.,  1.,
            1.,  1.,  1.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  3.,  3.,
            3.,  3.,  3.,  3.,  3.,  3.,  4.,  4.,  4.,  4.,  4.,  4.,  4.,
            4.,  5.,  5.,  5.,  5.,  5.,  5.,  5.,  5.,  6.,  6.,  6.,  6.,
            6.,  6.,  6.,  6.,  7.,  7.,  7.,  7.,  7.,  7.,  7.,  7.,  8.,
            8.,  8.,  8.,  8.,  8.,  8.,  8.,  9.,  9.,  9.,  9.,  9.,  9.,
            9.,  9., 10., 10., 10., 10., 10., 10., 10., 10., 1...
           21., 21., 21., 21., 21., 21., 21., 22., 22., 22., 22., 22., 22.,
           22., 22., 23., 23., 23., 23., 23., 23., 23., 23., 24., 24., 24.,
           24., 24., 24., 24., 24., 25., 25., 25., 25., 25., 25., 25., 25.,
           26., 26., 26., 26., 26., 26., 26., 26., 27., 27., 27., 27., 27.,
           27., 27., 27., 28., 28., 28., 28., 28., 28., 28., 28., 29., 29.,
           29., 29., 29., 29., 29., 29.]),
                                l1_reg=0, n_iter=100000, scale_reg=None,
                                supress_warning=True)),
                    ('regressor', Ridge(alpha=1))])



.. GENERATED FROM PYTHON SOURCE LINES 108-110

Extract results and compute performance metrics
-----------------------------------------------

.. GENERATED FROM PYTHON SOURCE LINES 110-128

.. code-block:: default


    # Extract from pipeline
    yhat = pipe.predict(X)
    sparsity_mask = pipe["variable_selection"].sparsity_mask_
    coef = pipe["regressor"].coef_.T

    # Construct full coefficient vector
    w_hat = np.zeros_like(w)
    w_hat[sparsity_mask] = coef

    R2 = r2_score(y, yhat)

    # Print performance metrics
    print(f"Number variables: {len(sparsity_mask)}")
    print(f"Number of chosen variables: {sparsity_mask.sum()}")
    print(f"R^2: {R2}, best possible R^2 = {R2_best}")






.. rst-class:: sphx-glr-script-out

 Out:

 .. code-block:: none

    Number variables: 240
    Number of chosen variables: 144
    R^2: 0.9278329616431523, best possible R^2 = 0.9394648554757948




.. GENERATED FROM PYTHON SOURCE LINES 129-131

Visualise regression coefficients
---------------------------------

.. GENERATED FROM PYTHON SOURCE LINES 131-142

.. code-block:: default

    for i in range(w.shape[1]):
        plt.figure()
        plt.plot(w[:, i], ".", label="True weights")
        plt.plot(w_hat[:, i], ".", label="Estimated weights")

    plt.figure()
    plt.plot([w.min(), w.max()], [coef.min(), coef.max()], "gray")
    plt.scatter(w, w_hat, s=10)
    plt.ylabel("Learned coefficients")
    plt.xlabel("True coefficients")
    plt.show()



.. rst-class:: sphx-glr-horizontal


    *

      .. image:: /auto_examples/images/sphx_glr_example_sparse_group_lasso_002.png
          :alt: example sparse group lasso
          :class: sphx-glr-multi-img

    *

      .. image:: /auto_examples/images/sphx_glr_example_sparse_group_lasso_003.png
          :alt: example sparse group lasso
          :class: sphx-glr-multi-img






.. rst-class:: sphx-glr-timing

   **Total running time of the script:** ( 0 minutes  39.015 seconds)


.. _sphx_glr_download_auto_examples_example_sparse_group_lasso.py:


.. only :: html

 .. container:: sphx-glr-footer
    :class: sphx-glr-footer-example



  .. container:: sphx-glr-download sphx-glr-download-python

     :download:`Download Python source code: example_sparse_group_lasso.py <example_sparse_group_lasso.py>`



  .. container:: sphx-glr-download sphx-glr-download-jupyter

     :download:`Download Jupyter notebook: example_sparse_group_lasso.ipynb <example_sparse_group_lasso.ipynb>`


.. only:: html

 .. rst-class:: sphx-glr-signature

    `Gallery generated by Sphinx-Gallery <https://sphinx-gallery.github.io>`_
