Source code for sconce.utils

# -*- coding: utf-8 -*-

# Author: Yikun Zhang
# Last Editing: June 26, 2022

# Description: This script contains the utility functions for using our package 
# in practice.

import numpy as np
from numpy import linalg as LA

#================================================================================#

[docs]def cart2sph(x, y, z):
    '''
    Converting the Euclidean coordinate of a data point in R^3 to its Spherical 
    coordinates.
    
    Parameters
    ----------
        x, y, z: floats 
            Euclidean coordinate in R^3 of a data point.
    
    Returns
    ----------
        theta: float
            Longitude (ranging from -180 degree to 180 degree).
        phi: float
            Latitude (ranging from -90 degree to 90 degree).
        r: float 
            Radial distance from the origin to the data point.
    '''
    dxy = np.sqrt(x**2 + y**2)
    r = np.sqrt(dxy**2 + z**2)
    theta = np.arctan2(y, x)
    phi = np.arctan2(z, dxy)
    theta, phi = np.rad2deg([theta, phi])
    return theta, phi, r

[docs]def sph2cart(theta, phi, r=1):
    '''
    Converting the Spherical coordinate of a data point to its Euclidean 
    coordinate in R^3.
    
    Parameters
    ----------
        theta: float
            Longitude (ranging from -180 degree to 180 degree).
        phi: float
            Latitude (ranging from -90 degree to 90 degree).
        r: float 
            Radial distance from the origin to the data point.
            
    Returns
    ----------
        x, y, z: floats 
            Euclidean coordinate in R^3 of a data point.
    '''
    theta, phi = np.deg2rad([theta, phi])
    z = r * np.sin(phi)
    rcosphi = r * np.cos(phi)
    x = rcosphi * np.cos(theta)
    y = rcosphi * np.sin(theta)
    return x, y, z

[docs]def CirSphSampling(N, lat_c=60, lon_range=[-180,180], sigma=0.01, 
                   pv_ax=np.array([0,0,1])):
    '''
    Generating data points from a circle on the unit sphere with additive 
    Gaussian noises to their Cartesian coordinates plus L2 normalizations.
    
    Parameters
    ----------
        N: int
            The number of randomly generated data points.
            
        lat_c: float
            The latitude of the circle with respect to the pivotal axis. 
            (range: 0-90)
            
        lon_range: 2-element list
            The longitude range that the circular structure covers. When 
            "lon_range=[-180,180]", the underlying structure is a full circle.
            
        sigma: float
            The standard deviation of Gaussian noises.
    
        pv_ax: (3,)-array
            The pivotal axis of the circle on the sphere from which the data 
            points are generated (plus noises).
            
    Return
    ----------
        pts_c_noise: (N,3)-array
            The Cartesian coordinates of N simulated data points.
    
    '''
    ## Random longitudes with range (0, 180)
    lon_c = np.random.rand(N,)*(lon_range[1]-lon_range[0]) + lon_range[0]
    lat_c = np.ones((N,))*lat_c
    x_c, y_c, z_c = sph2cart(lon_c, lat_c)

    pts_c = np.concatenate((x_c.reshape(len(x_c), 1), 
                            y_c.reshape(len(y_c), 1),
                            z_c.reshape(len(z_c), 1)), axis=1)
    ## Add Gaussian noises
    pts_c_noise = pts_c + sigma * np.random.randn(pts_c.shape[0], pts_c.shape[1])
    ## Standardize the noisy points
    pts_c_noise = pts_c_noise/np.sqrt(np.sum(pts_c_noise**2, axis=1)).reshape(N,1)
    
    ## Rotate the data samples accordingly
    mu_c = np.array([[0,0,1]])
    R = 2*np.dot(pv_ax.reshape(3,1)+mu_c.T, pv_ax.reshape(1,3)+mu_c)/\
        np.sum((mu_c+pv_ax.reshape(1,3))**2, axis=1) - np.identity(3)
    pts_c_noise = np.dot(R, pts_c_noise.T).T
    return pts_c_noise


[docs]def GaussMixture(N, mu=np.array([[1,1]]), cov=np.diag([1,1]).reshape(2,2,1), 
                 prob=[1.0]):
    '''
    Generating data points from a Gaussian mixture model.
    
    Parameters
    ----------
        N: int
            The number of randomly generated data points.
    
        mu: (m,d)-array
            The means of the Gaussian mixture model with m components.
       
        cov: (d,d,m)-array
            The (d,d)-covariance matrices of the Gaussian mixture model with 
            m components.
            
        prob: list of floats
            The mixture probabilities.
    
    Return
    ----------
        data_ps: (N,d)-array
            The Cartesian coordinates of N simulated data points.
    '''
    m = len(prob)   ## The number of mixtures
    d = mu.shape[1]  ## Dimension of the data
    assert (cov.shape[2] == len(prob)), "'cov.shape[2]' and 'len(prob)' "\
    "should be equal."
    inds = np.random.choice(list(range(m)), N, replace=True, 
                            p=np.array(prob)/sum(prob))
    data_ps = np.zeros((N,d))
    for i in range(m):
        data_ps[inds == i,:] = np.random.multivariate_normal(mu[i,:], cov[:,:,i], 
                                                             size=sum(inds == i))
    return data_ps


[docs]def vMFDensity(x, mu=np.array([[0,0,1]]), kappa=[1.0], prob=[1.0]):
    '''
    The q-dimensional von-Mises Fisher (vMF) density function or its mixture.
    
    Parameters
    ----------
        x: (n,d)-array
            The Eulidean coordinates of n query points on a unit hypersphere, 
            where d=q+1 is the Euclidean dimension of data.
    
        mu: (m,d)-array
            The Euclidean coordinates of the m mean directions for a mixture of 
            vMF densities.
       
        kappa: list of floats
            The concentration parameters for a mixture of vMF densities.
       
        prob: list of floats
            The mixture probabilities.
            
    Return
    --------
        mix_den: (n,)-array
            The corresponding density value on each query point.
    '''
    assert (mu.shape[1] == x.shape[1] and mu.shape[0] == len(prob)), \
    "The parameter 'x' and mu' should be a (n,d)-array and (m,d)-array, respectively, \
    and 'prob' should be a list of length m."
    assert (len(kappa) == len(prob)), "The parameters 'kappa' and 'prob' should \
    be of the same length."
    d = x.shape[1]   ## Euclidean dimension of the data
    prob = np.array(prob).reshape(len(prob), 1)
    kappa = np.array(kappa)
    dens = kappa**(d/2-1)*np.exp(kappa*np.dot(x, mu.T))/((2*np.pi)**(d/2)*sp.iv(d/2-1, kappa))
    mix_den = np.dot(dens, prob)
    return mix_den


[docs]def vMFSamp(n, mu=np.array([0,0,1]), kappa=1):
    '''
    Randomly sampling data points from a q-dimensional von-Mises Fisher (vMF) 
    density with analytic approaches.
    
    Parameters
    ---------
        n: int
            The number of sampling random data points.
        
        mu: (d, )-array
            The Euclidean coordinate of the mean directions of the q-dim vMF
            density, where d=q+1.
            
        kappa: float
            The concentration parameter of the vMF density.
    
    Return
    --------
        data_ps: (n, d)-array
            The Euclidean coordinates of the randomly sampled points from the 
            vMF density.
    '''
    
    d = len(mu)   # Euclidean dimension of the data
    if d == 3:    # when d=3 (S^2 case), no rejection sampling is required
        U1 = np.random.rand(n,1)
        U2 = 2*np.pi*np.random.rand(n,1)
        W = 1 + np.log(U1 + (1-U1)*np.exp(-2*kappa))/kappa
        X = np.cos(U2) * np.sqrt(1-W**2)
        Y = np.sin(U2) * np.sqrt(1-W**2)
        data_ps = np.concatenate((X,Y,W), axis=1)
    else:
        t1 = np.sqrt(4*(kappa**2) + (d-1)**2)
        b = (-2*kappa + t1)/(d-1)
        x0 = (1-b)/(1+b)
        m = (d-1)/2
        c = kappa*x0 + (d-1)*np.log(1-x0**2)
        data_ps = np.zeros((n,d))
        for i in range(n):
            t = -1000
            U = 1
            while t < np.log(U) + c:
                z = np.random.beta(a=m, b=m)
                U = np.random.rand(1,)[0]
                w = (1-(1+b)*z)/(1-(1-b)*z)
                t = kappa*w + (d-1)*np.log(1-x0*w)
            # Generate a vector from the uniform distribution on the unit hypersphere
            V = np.random.multivariate_normal(mean=np.zeros((d-1,)), 
                                              cov=np.identity(d-1), size=1)[0]
            V = V/np.sqrt(np.sum(V**2))
            data_ps[i, 0:(d-1)] = np.sqrt(1-w**2)*V
            data_ps[i, d-1] = w
    # Rotate the data samples accordingly
    mu_c = np.zeros((1,d))
    mu_c[0,d-1] = 1
    R = 2*np.dot(mu.reshape(d,1)+mu_c.T, mu.reshape(1,d)+mu_c) / \
        np.sum((mu_c+mu.reshape(1,d))**2, axis=1) - np.identity(d)
    data_ps = np.dot(R, data_ps.T).T
    return data_ps


[docs]def vMFRejectSamp(n, mu=np.array([0,0,1]), kappa=1):
    '''
    Randomly sampling data points from a q-dimensional von-Mises Fisher (vMF) 
    density via rejection sampling.
    
    Parameters
    ---------
        n: int
            The number of sampling random data points.
        
        mu: (d, )-array
            The Euclidean coordinate of the mean directions of the q-dim vMF
            density, where d=q+1.
            
        kappa: float
            The concentration parameter of the vMF density.
    
    Return
    --------
        data_ps: (n, d)-array
            The Euclidean coordinates of the randomly sampled points from the 
            vMF density.
    '''
    d = len(mu)   ## Euclidean dimension of the data
    data_ps = np.zeros((n,d))
    ## Sample points from standard normal and then standardize them
    sam_can = np.random.multivariate_normal(mean=np.zeros((d,)), cov=np.identity(d), size=n)
    dist_sam = np.sqrt(np.sum(sam_can**2, axis=1)).reshape(n,1)
    sam_can = sam_can/dist_sam

    unif_sam = np.random.uniform(0, 1, n)
    ## Reject some inadequate data points  
    ## (When the uniform proposal density is used, the normalizing constant in 
    ## front of the vMF density has no effects in rejection sampling.)
    mu = mu.reshape(d,1)
    sams = sam_can[unif_sam < np.exp(kappa*(np.dot(sam_can, mu)-1))[:,0],:]
    cnt = sams.shape[0]
    data_ps[:cnt,:] = sams
    while cnt < n:
        can_p = np.random.multivariate_normal(mean=np.zeros((d,)), cov=np.identity(d), size=1)
        can_p = can_p/np.sqrt(np.sum(can_p**2))
        unif_p = np.random.uniform(0, 1, 1)
        if np.exp(kappa*(np.dot(can_p, mu)-1)) > unif_p:
            data_ps[cnt,:] = can_p
            cnt += 1
    return data_ps


[docs]def vMFMixtureSamp(n, mu=np.array([[0,0,1]]), kappa=[1.0], prob=[1.0]):
    '''
    Randomly sampling data points from a mixture of q-dimensional von-Mises 
    Fisher (vMF) densities.
    
    Parameters
    ---------
        n: int
            The number of sampling random data points.
    
        mu: (m,d)-array
            The Euclidean coordinates of the m mean directions for a mixture of 
            vMF densities.
       
        kappa: list of floats
            The concentration parameters for the mixture of von-Mises Fisher 
            densities.
       
        prob: list of floats
            The mixture probabilities.
            
    Return
    --------
        data_ps: (n,d)-array
            The Euclidean coordinates of the randomly sampled points from the 
            vMF mixture.
    '''
    m = len(prob)   ## The number of mixtures
    d = mu.shape[1]  ## Euclidean dimension of the data
    assert (len(kappa) == len(prob)), "The parameters 'kappa' and 'prob' should be of the same length."
    inds = np.random.choice(list(range(m)), n, replace=True, p=np.array(prob)/sum(prob))
    data_ps = np.zeros((n,d))
    for i in range(m):
        data_ps[inds == i,:] = vMFSamp(sum(inds == i), mu=mu[i,:], kappa=kappa[i])
    return data_ps


[docs]def vMFGaussMixture(n, q=2, D=2, mu_vMF=np.array([[0,0,1]]), kappa=[1.0], 
                    mu_N=np.array([[1,1]]), cov=np.diag([1,1]).reshape(2,2,1), 
                    prob=[1.0]):
    '''
    Randomly sampling data points from a mixture of q-dimensional von-Mises Fisher 
    and D-dimensional Gaussian distributions (directional-linear mixture model).
    
    Parameters
    -----------
        n: int
            The number of sampling random data points.
            
        q: int
            Intrinsic data dimension of directional components.
            
        D: int
            Data dimension of linear components.
    
        mu_vMF: a (m,q+1)-array
            Euclidean coordinates of the m mean directions for the mixture of 
            von-Mises Fisher densities.
       
        kappa: list of floats
            The m concentration parameters for the mixture of von-Mises Fisher 
            densities.
            
        mu_N: (m,D)-array
            The means of the Gaussian mixture model with m components. 
       
        cov: (D,D,m)-array
            The (D,D)-covariance matrices of the Gaussian mixture model with 
            m components.
       
        prob: list of floats
            The m mixture probabilities.
            
    Return
    -------
        data_ps: (n, q+1+D)-array
            Euclidean coordinates of the randomly sampled points from the 
            vMF-Gaussian mixtures.
    '''
    m = len(prob)   ## The number of mixtures
    
    assert (len(kappa) == len(prob)), "The parameters 'kappa' and 'prob' "\
    "should be of the same length."
    assert (cov.shape[2] == len(prob)), "'cov.shape[2]' and 'len(prob)' "\
    "should be equal."
    
    inds = np.random.choice(list(range(m)), n, replace=True, 
                            p=np.array(prob)/sum(prob))
    data_ps = np.zeros((n,q+1+D))
    for i in range(m):
        data_ps[inds == i,:(q+1)] = vMFSamp(sum(inds == i), mu=mu_vMF[i,:], 
                                             kappa=kappa[i])
        data_ps[inds == i,(q+1):(q+1+D)] \
            = np.random.multivariate_normal(mu_N[i,:], cov[:,:,i], size=sum(inds == i))
    return data_ps


[docs]def SmoothBootstrap_vMF(data, B=1000, h=None):
    '''
    Resampling a dataset using the smoothed bootstrap with the von Mises kernel.
    
    Parameters
    ---------
        data: (n,d)-array
            The Euclidean coordinates of n directional data points in 
            the d-dimensional Euclidean space, where d=q+1.
            
        B: int
            The number of bootstrapping times.
            
        h: float
           The bandwidth parameter. (Default: h=None. Then a rule of thumb for 
           directional KDEs with the von Mises kernel in Garcia-Portugues (2013)
           is applied.)
            
    Return
    --------
        data_Boot: (n,d)-array
            The Euclidean coordinates of the smoothed bootstrap dataset.
    '''
    n = data.shape[0]  ## Number of data points
    d = data.shape[1]  ## Euclidean Dimension of the data

    ## Rule of thumb for directional KDE
    if h is None:
        R_bar = np.sqrt(sum(np.mean(data, axis=0) ** 2))
        ## An approximation to kappa (Banerjee 2005 & Sra, 2011)
        kap_hat = R_bar * (D - R_bar ** 2) / (1 - R_bar ** 2)
        if D == 3:
            h = (8*np.sinh(kap_hat)**2/(n*kap_hat * \
                 ((1+4*kap_hat**2)*np.sinh(2*kap_hat) - 2*kap_hat*np.cosh(2*kap_hat))))**(1/6)
        else:
            h = ((4 * np.sqrt(np.pi) * sp.iv(D / 2 - 1, kap_hat)**2) / \
                 (n * kap_hat ** (D / 2) * (2 * (D - 1) * sp.iv(D/2, 2*kap_hat) + \
                                  (D+1) * kap_hat * sp.iv(D/2+1, 2*kap_hat)))) ** (1/(D + 3))
        print("The current bandwidth is " + str(h) + ".\n")
        
    data_Boot = np.zeros((B, d))
    for i in range(B):
        ind = np.random.choice(n, size=1, replace=True)
        data_Boot[i,:] = vMFSamp(1, mu=data[ind[0],:], kappa=1/(h**2))
    return data_Boot