Code Reproducibility

import numpy as np
from graphbook_code import heatmap

def generate_unit_circle(radius):
    diameter = 2*radius + 1
    rx = ry = diameter/2
    x, y = np.indices((diameter, diameter))

    circle_dist = np.hypot(rx - x, ry - y)
    diff_from_radius = np.abs(circle_dist - radius)
    less_than_half = diff_from_radius < 0.5

    return less_than_half.astype(int)

def add_smile():
    canvas = np.zeros((51, 51))
    canvas[2:45, 2:45] = generate_unit_circle(21)
    mask = np.zeros((51, 51), dtype=bool)
    mask[np.triu_indices_from(mask)] = True
    upper_left = np.rot90(mask)
    canvas[upper_left] = 0
    return canvas
    
def smile_probability(upper_p, lower_p):
    smiley = add_smile()
    P = generate_unit_circle(25)
    P[5:16, 25:36] = generate_unit_circle(5)
    P[smiley != 0] = smiley[smiley != 0]
    
    mask = np.zeros((51, 51), dtype=bool)
    mask[np.triu_indices_from(mask)] = True
    P[~mask] = 0
    # symmetrize the probability matrix
    P = (P + P.T - np.diag(np.diag(P))).astype(float)
    P[P == 1] = lower_p
    P[P == 0] = upper_p
    return P

P = smile_probability(.95, 0.05)
heatmap(P, vmin=0, vmax=1, title="Probability matrix $P$")
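As a quick added sanity check (not part of the original listing), the matrix returned by smile_probability should be symmetric with entries that are valid probabilities:

# check that P is symmetric and that every entry lies in [0, 1]
print(np.allclose(P, P.T))          # expect True
print(((P >= 0) & (P <= 1)).all())  # expect True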
from graspologic.simulations import sample_edges

A = sample_edges(P, directed=False, loops=False)
heatmap(A.astype(int), title="$IER_n(P)$ sample")
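A rough sketch of validating the sampler, assuming P from above: averaged over many draws, the empirical edge frequencies should approach P off the diagonal (loops are excluded, so the diagonal stays zero).

# average many independent samples and compare to P off the diagonal
A_mean = np.mean([sample_edges(P, directed=False, loops=False) for _ in range(500)], axis=0)
offdiag = ~np.eye(len(P), dtype=bool)
print(np.abs(A_mean - P)[offdiag].max())  # should be small, up to sampling noise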
import numpy as np
from math import comb

node_count = np.arange(2, 51)
log_unique_network_count = np.array([comb(n, 2) for n in node_count])*np.log10(2)
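For reference, the last entry corresponds to n = 50; a minimal check of its magnitude, assuming the arrays above:

# log (base 10) of the number of unique simple networks with 50 nodes
print(log_unique_network_count[-1])  # comb(50, 2)*log10(2), roughly 368.8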
from graphbook_code import draw_multiplot
from graspologic.simulations import er_np

n = 50  # network with 50 nodes
p = 0.3  # probability of an edge existing is .3

# sample a single simple adjacency matrix from ER(50, .3)
A = er_np(n=n, p=p, directed=False, loops=False)

# and plot it
draw_multiplot(A.astype(int), title="$ER_{50}(0.3)$ Simulation")
p = 0.7  # network has an edge probability of 0.7

# sample a single adjacency matrix from ER(50, 0.7)
A = er_np(n=n, p=p, directed=False, loops=False)
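An added sketch of checking this sample against the model: the empirical edge density off the diagonal should land near p.

# fraction of possible (unordered) edges that are present, excluding loops
density = A.sum() / (n * (n - 1))
print(density)  # should be close to 0.7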
from graphbook_code import plot_vector
import numpy as np

n = 100  # number of students

# z is a column vector of 50 1s followed by 50 2s
# this vector gives the school each of the 100 students is from
z = np.repeat([1, 2], repeats=n//2)
plot_vector(z, title="$\\vec z$, Node Assignment Vector",
            legend_title="School", color="qualitative", 
            ticks=[0.5, 49.5, 99.5], ticklabels=[1, 50, 100],
            ticktitle="Student")
from graphbook_code import heatmap

K = 2  # community count
# construct the block matrix B as described above
B = np.array([[0.6, 0.1], 
              [0.1, 0.4]])

heatmap(B, xticklabels=[1, 2], yticklabels=[1,2], vmin=0, 
             vmax=1, annot=True, xtitle="School",
             ytitle="School", title="Block Matrix $B$")
from graspologic.simulations import sbm
from graphbook_code import draw_multiplot

# sample a graph from SBM_n(z, B) with n = 100 nodes
A, labels = sbm(n=[n//2, n//2], p=B, directed=False, loops=False, return_labels=True)
draw_multiplot(A, labels=labels, title="$SBM_n(z, B)$ Simulation");
import numpy as np

# generate a reordering of the n nodes
permutation = np.random.choice(n, size=n, replace=False)

Aperm = A[permutation][:,permutation]
yperm = labels[permutation]
heatmap(Aperm, title="Nodes randomly reordered")
def ohe_comm_vec(z):
    """
    A function to generate the one-hot-encoded community
    assignment matrix from a community assignment vector.
    """
    K = len(np.unique(z))
    n = len(z)
    C = np.zeros((n, K))
    for i, zi in enumerate(z):
        C[i, zi - 1] = 1
    return C
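A small usage sketch of ohe_comm_vec on a toy assignment vector (illustrative, not from the original listing):

# toy community assignment vector for five nodes and two communities
z_toy = np.array([1, 1, 2, 2, 1])
C_toy = ohe_comm_vec(z_toy)
print(C_toy)
# each row contains a single 1, in the column of that node's community
print(C_toy.sum(axis=1))  # all ones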
import numpy as np
from graphbook_code import lpm_heatmap

n = 100  # the number of nodes in our network
# design the latent position matrix X according to 
# the rules we laid out previously
X = np.zeros((n,2))
for i in range(0, n):
    X[i,:] = [(n - i)/n, i/n]

lpm_heatmap(X, ytitle="Person", xticks=[0.5, 1.5], xticklabels=[1, 2], 
            yticks=[0.5, 49.5, 99.5], yticklabels=[1, 50, 100],
            xtitle="Latent Dimension", title="Latent Position Matrix, X")
from graspologic.simulations import rdpg
from graphbook_code import heatmap

# sample an RDPG with the latent position matrix
# created above
A = rdpg(X, loops=False, directed=False)

# and plot it
heatmap(A.astype(int), xtitle="Person", ytitle="Person",
        title="$RDPG_{100}(X)$ Simulation")
import numpy as np

def block_mtx_psd(B):
    """
    A function which indicates whether a matrix
    B is positive semidefinite.
    """
    return np.all(np.linalg.eigvals(B) >= 0)
import numpy as np
from graphbook_code import heatmap

B = np.array([[0.6, 0.2], 
              [0.2, 0.4]])
heatmap(B, title="A homophilic block matrix", annot=True, vmin=0, vmax=1)
block_mtx_psd(B)
# True
B_indef = np.array([[.1, .2], 
                    [.2, .1]])
block_mtx_psd(B_indef)
# False
# a positive semidefinite kidney-egg block matrix
B_psd = np.array([[.6, .2], 
                  [.2, .2]])
block_mtx_psd(B_psd)
# True

# an indefinite kidney-egg block matrix
B_indef = np.array([[.1, .2], 
                    [.2, .2]])
block_mtx_psd(B_indef)
# False
# a positive semidefinite core-periphery block matrix
B_psd = np.array([[.6, .2], 
                  [.2, .1]])
block_mtx_psd(B_psd)
# True

# an indefinite core-periphery block matrix
B_indef = np.array([[.6, .2], 
                    [.2, .05]])
block_mtx_psd(B_indef)
# False
# an indefinite disassortative block matrix
B = np.array([[.1, .5], 
              [.5, .2]])
block_mtx_psd(B)
# False
# homophilic, and hence positive semidefinite, block matrix
B = np.array([[0.6, 0.2], 
              [0.2, 0.4]])

# generate square root matrix
sqrtB = np.linalg.cholesky(B)

# verify that the factorization reproduces B element-wise
# use allclose instead of array_equal because of tiny
# numerical precision errors
np.allclose(sqrtB @ sqrtB.T, B)
# True
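Note that np.linalg.cholesky returns the lower-triangular factor, which is why the check above multiplies sqrtB by its transpose; a quick added verification:

# the Cholesky factor is lower triangular
print(np.allclose(sqrtB, np.tril(sqrtB)))  # True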
from graphbook_code import ohe_comm_vec

def lpm_from_sbm(z, B):
    """
    A function to produce a latent position matrix from a
    community assignment vector and a block matrix.
    """
    if not block_mtx_psd(B):
        raise ValueError("Latent position matrices require PSD block matrices!")
    # one-hot encode the community assignment vector
    C = ohe_comm_vec(z)
    # compute square root matrix
    sqrtB = np.linalg.cholesky(B)
    # X = C*sqrt(B)
    return C @ sqrtB

# make a community assignment vector for 25 nodes / community
nk = 25
z = np.repeat([1, 2], nk)

# latent position matrix for an equivalent RDPG
X = lpm_from_sbm(z, B)
from graphbook_code import generate_sbm_pmtx

# generate the probability matrices for an RDPG using X and SBM
P_rdpg = X @ X.T
P_sbm = generate_sbm_pmtx(z, B)

# verify equality element-wise
np.allclose(P_rdpg, P_sbm)
# True
import numpy as np
from graspologic.simulations import sample_edges
from graphbook_code import heatmap, plot_vector, \
    generate_sbm_pmtx

def dcsbm(z, theta, B, directed=False, loops=False, return_prob=False):
    """
    A function to sample a DCSBM.
    """
    # uncorrected probability matrix
    Pp = generate_sbm_pmtx(z, B)
    theta = theta.reshape(-1)
    # apply the degree correction
    Theta = np.diag(theta)
    P = Theta @ Pp @ Theta.transpose()
    network = sample_edges(P, directed=directed, loops=loops)
    if return_prob:
        network = (network, P)
    return network
# Observe a network from a DCSBM
nk = 50  # students per school
z = np.repeat([1, 2], 50)
B = np.array([[0.6, 0.2], [0.2, 0.4]])  # same probabilities as from SBM section
theta = np.tile(np.linspace(1, 0.5, nk), 2)
A, P = dcsbm(z, theta, B, return_prob=True)

# Visualize
plot_vector(z, title="$\\vec z$", legend_title="School", color="qualitative", 
            ticks=[0.5, 49.5, 99.5], ticklabels=[1, 50, 100],
            ticktitle="Student")
plot_vector(theta, title="$\\vec \\theta$", 
            legend_title="Degree-Correction Factor", 
            ticks=[0.5, 49.5, 99.5], ticklabels=[1, 50, 100],
            ticktitle="Student")
heatmap(P, title="$P = \\Theta C B C^\\top \\Theta^\\top$", vmin=0, vmax=1)
heatmap(A.astype(int), title="Sample of $DCSBM_n(\\vec z, \\vec \\theta, B)$")
import numpy as np

n = 100
Z = np.ones((n, n))
for i in range(0, int(n / 2)):
    Z[int(i + n / 2), i] = 3
    Z[i, int(i + n / 2)] = 3
Z[0:50, 0:50] = Z[50:100, 50:100] = 2
np.fill_diagonal(Z, 0)
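Before using Z below, it can help to confirm that it is symmetric and to count how many node pairs fall into each edge cluster; this check is an added sketch, assuming Z as constructed above.

# Z should be symmetric, since edge clusters are defined on unordered pairs
print(np.allclose(Z, Z.T))  # True
# number of node pairs per cluster, using the upper triangle (excluding the diagonal)
triu = np.triu_indices_from(Z, k=1)
print(np.unique(Z[triu], return_counts=True))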
from graphbook_code import heatmap

labels = np.repeat(["L", "R"], repeats=n/2)
heatmap(Z.astype(int), title="Cluster assignment matrix", 
        inner_hier_labels=labels)
from graphbook_code import siem, plot_vector

p = np.array([0.1, 0.3, 0.8])
A = siem(n, p, Z)
plot_vector(p, title="probability vector", vmin=0, vmax=1, annot=True)
heatmap(A.astype(int), title="$SIEM_n(Z, \\vec p)$ sample", 
        inner_hier_labels=labels)
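As an added sketch of verifying the SIEM sample, the empirical edge frequency within each cluster should be close to the corresponding entry of the probability vector (assuming A, Z, and p from above).

# empirical edge frequency per edge cluster, computed on the upper triangle
triu = np.triu_indices_from(Z, k=1)
for k, pk in enumerate(p, start=1):
    phat = A[triu][Z[triu] == k].mean()
    print(f"cluster {k}: expected {pk}, observed {phat:.3f}")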
from graspologic.simulations import sbm
import numpy as np
from graphbook_code import dcsbm
from sklearn.preprocessing import LabelEncoder

# Create block probability matrix B
K = 3
B = np.full(shape=(K, K), fill_value=0.15)
np.fill_diagonal(B, 0.4)

# degree-correct the different groups for LinkedIn
ml, admin, marketing = nks = [50, 25, 25]
theta = np.ones((np.sum(nks), 1))
theta[(ml):(ml + admin), :] = np.sqrt(2)

# our dcsbm function only works with communities encoded 1,2,...,K
# so we'll use a LabelEncoder to map labels to natural numbers
labels = np.repeat(["ML", "AD", "MA"], nks)
le = LabelEncoder().fit(labels)
z = le.transform(labels) + 1

# sample the random networks
A_facebook = sbm(n=nks, p=B)
A_insta = sbm(n=nks, p=B)
A_linkedin, P_linkedin = dcsbm(z, theta, B, return_prob=True)
from graphbook_code import generate_sbm_pmtx, heatmap

# we already obtained P_linkedin, the LinkedIn probability
# matrix, from the dcsbm() function
P_facebook_insta = generate_sbm_pmtx(z, B)

# when plotting for comparison purposes, make sure you are
# using the same scale from 0 to 1
heatmap(P_facebook_insta, vmin=0, vmax=1)
heatmap(P_linkedin, vmin=0, vmax=1)
heatmap(P_linkedin - P_facebook_insta, vmin=0, vmax=1)
from graspologic.embed import MultipleASE as mase
from graphbook_code import lpm_heatmap

embedder = mase(n_components=3, svd_seed=0)
# obtain shared latent positions
S = embedder.fit_transform([P_facebook_insta, P_facebook_insta, P_linkedin])

lpm_heatmap(S)
import matplotlib.pyplot as plt

R_facebook = embedder.scores_[0]
R_insta = embedder.scores_[1]
R_linkedin = embedder.scores_[2]

# and plot them
smin = np.min(embedder.scores_)
smax = np.max(embedder.scores_)

fig, axs = plt.subplots(1, 3, figsize=(20, 7))
heatmap(R_facebook, vmin=smin, vmax=smax, ax=axs[0], annot=True, title="facebook score matrix")
heatmap(R_insta, vmin=smin, vmax=smax, ax=axs[1], annot=True, title="Instagram score matrix")
heatmap(R_linkedin, vmin=smin, vmax=smax, ax=axs[2], annot=True, title="LinkedIn score matrix")
from graphbook_code import lpm_from_sbm
X_facebook_insta = lpm_from_sbm(z, B)
from graspologic.simulations import rdpg_corr

# generate the network samples
rho = 0.7
facebook_correlated_network, insta_correlated_network = rdpg_corr(
    X_facebook_insta, Y=None, r=rho
)

# the difference matrix
correlated_difference_matrix = np.abs(
    facebook_correlated_network - insta_correlated_network
)
# the total number of differences
correlated_differences = correlated_difference_matrix.sum()
rho_nil = 0.0
facebook_uncorrelated_network, insta_uncorrelated_network = rdpg_corr(
    X_facebook_insta, Y=None, r=rho_nil
)

# the difference matrix
uncorrelated_difference_matrix = np.abs(
    facebook_uncorrelated_network - insta_uncorrelated_network
)
# the total number of differences
uncorrelated_differences = uncorrelated_difference_matrix.sum()
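An added comparison of the two settings: with rho = 0.7 the paired networks should disagree on far fewer edges than with rho = 0.

# correlated pairs should disagree on fewer edges than uncorrelated pairs
print(correlated_differences, uncorrelated_differences)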
import numpy as np
from graspologic.simulations import sample_edges

nodenames = [
    "SI", "L", "H/E", 
    "T/M", "BS"
]

# generate probability matrices
n = 5  # the number of nodes
P_earthling = 0.3*np.ones((n, n))
signal_subnetwork = np.zeros((n, n), dtype=bool)
signal_subnetwork[1:n, 0] = True
signal_subnetwork[0, 1:n] = True
P_astronaut = np.copy(P_earthling)
P_astronaut[signal_subnetwork] = np.tile(np.linspace(0.4, 0.9, num=4), 2)

# sample two networks
A_earthling = sample_edges(P_earthling)
A_astronaut = sample_edges(P_astronaut)
# plot probability matrices and their differences on the same scale
heatmap(P_earthling, vmin=0, vmax=1)
heatmap(P_astronaut, vmin=0, vmax=1)
heatmap(np.abs(P_astronaut - P_earthling), vmin=0, vmax=1)
# plot the signal subnetwork
ax = heatmap(signal_subnetwork)
# sample the class label of each of the M networks
M = 200  # the number of training and testing samples
pi_astronaut = 0.45
pi_earthling = 0.55
np.random.seed(0)
yvec = np.random.choice(2, p=[pi_earthling, pi_astronaut], size=M)

# sample network realizations given the class of each sample
Ps = [P_earthling, P_astronaut]

As = np.stack([sample_edges(Ps[y]) for y in yvec], axis=2)
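As a final added check, the stacked tensor should contain one 5 x 5 adjacency matrix per sample, and the class labels should be split roughly 55/45 between earthlings (0) and astronauts (1).

# As stacks the M sampled networks along the third axis
print(As.shape)  # (5, 5, 200)
# class counts: 0 = earthling, 1 = astronaut
print(np.unique(yvec, return_counts=True))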