---
title: "Categorical and continuous feature embedding"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{Categorical and continuous feature embedding}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r, include = FALSE}
# Global chunk options for the vignette: collapse source and output into a
# single block and prefix printed output with "#>".
knitr::opts_chunk$set(collapse = TRUE, comment = "#>")
```

# Necessary packages

```{r setup, results = 'hide', message=FALSE}
library(rsetse)
library(igraph)
library(dplyr)
library(ggraph)
```


# Plot the data

The `biconnected_network` is a simple network made up of two groups of nodes that are linked by a single edge. This means the network consists of three bi-connected components, hence the name. For more information type `?biconnected_network`.
```{r message=FALSE}
# Draw the example network with straight edges and colour each node by the
# group it belongs to.
ggraph(biconnected_network) +
  geom_edge_link() +
  geom_node_point(aes(colour = group), size = 3)
```


# Continuous embeddings

Creating the embeddings using this dataset is very simple as it already has a force variable and the edge weights can be used as the spring constant.
```{r message=FALSE}

# Embed the network using the node attribute "force" as the force variable and
# the edge attribute "weight" as the spring constant k.
# The node-level aggregation with create_node_edge_df() is done in the
# plotting chunk, so no intermediate summary is computed here.
embeddings_cont <- biconnected_network %>%
  prepare_edges() %>%
  prepare_continuous_force(node_names = "name", force_var = "force") %>%
  setse_auto(k = "weight")

```

However, we may sometimes want to set the spring constant to something else. Below we perform the embedding using a fixed k of 500.

```{r message=FALSE}

# Same pipeline as the variable-k embedding, except that prepare_edges() is
# asked for a constant k of 500, which setse_auto() then reads via k = "k".
embeddings_cont_fixed <- biconnected_network %>%
  prepare_edges(k = 500) %>%
  prepare_continuous_force(node_names = "name", force_var = "force") %>%
  setse_auto(k = "k")

```

## Plot the resulting embeddings

By aggregating the tension in each edge to node level using `create_node_edge_df()` for both embedding methods, we can see how the different nodes are embedded. What we see is that the two most central nodes experience much more tension and also have a substantially higher elevation than the other nodes. This is expected because, in the `biconnected_network`, the force on each node is its centrality.

We can also see that the embeddings are similar but having fixed or variable k-strength has a clear impact on the final embeddings.

```{r message=FALSE}

# Aggregate edge tension to node level for each continuous embedding and label
# the rows with the spring-constant scheme that produced them.
continuous_results <- bind_rows(
  mutate(create_node_edge_df(embeddings_cont), type = "variable k"),
  mutate(create_node_edge_df(embeddings_cont_fixed), type = "fixed k")
)

# Mean tension against elevation, with one panel per spring-constant scheme.
ggplot(
  continuous_results,
  aes(x = tension_mean, y = elevation, colour = node)
) +
  geom_jitter() +
  facet_wrap(~type) +
  labs(
    title = "Continuous embeddings",
    x = "mean tension"
  )

```

# Categorical embeddings

Now we will use the group identity as a binary force variable. The network is made up of two groups, A and B. We arbitrarily set A to be the positive force.

As can be seen, embeddings that use the groups as force variables produce very different results, despite the fact that the networks are identical.

For factor levels of more than two, the high-dimensional setse should be used.

```{r message=FALSE}

# Binary embedding with the edge weights as spring constants.
# force = "group_A" makes membership of group A the positive force.
embeddings_binary <- biconnected_network %>%
  prepare_edges() %>%
  prepare_categorical_force(node_names = "name", force_var = "group") %>%
  setse_auto(force = "group_A", k = "weight")

# The same binary embedding, but with a fixed spring constant of 500 on every
# edge.
embeddings_binary_fixed <- biconnected_network %>%
  prepare_edges(k = 500) %>%
  prepare_categorical_force(node_names = "name", force_var = "group") %>%
  setse_auto(force = "group_A", k = "k")

# Node-level summaries of both binary embeddings, labelled by the
# spring-constant scheme that produced them.
binary_results <- bind_rows(
  mutate(create_node_edge_df(embeddings_binary), type = "variable k"),
  mutate(create_node_edge_df(embeddings_binary_fixed), type = "fixed k")
)

# Mean tension against elevation, with one panel per spring-constant scheme.
ggplot(
  binary_results,
  aes(x = tension_mean, y = elevation, colour = node)
) +
  geom_jitter() +
  facet_wrap(~type) +
  labs(
    title = "Binary embeddings",
    x = "mean tension"
  )

```

# Using both embeddings

Because this network has two features, we can embed this high-dimensional network using the `setse_auto_hd` function.

What we see is that in this case there is very little relationship between the elevation of a node when it is embedded using different features.

```{r message=FALSE}


# Two-feature embedding using the edge weights as spring constants.
# Both force variables are prepared exactly as in the single-feature case and
# then embedded together with the high-dimensional function.
two_dimensional_embeddings <- biconnected_network %>%
  prepare_edges() %>%
  prepare_continuous_force(node_names = "name", force_var = "force") %>%
  prepare_categorical_force(node_names = "name", force_var = "group") %>%
  setse_auto_hd(force = c("group_A", "force"), k = "weight")

# The same two-feature embedding with a fixed spring constant of 500.
two_dimensional_embeddings_fixed <- biconnected_network %>%
  prepare_edges(k = 500) %>%
  prepare_continuous_force(node_names = "name", force_var = "force") %>%
  prepare_categorical_force(node_names = "name", force_var = "group") %>%
  setse_auto_hd(force = c("group_A", "force"), k = "k")

# Stack the node-level elevations from both high-dimensional embeddings,
# labelled by the spring-constant scheme that produced them.
bind_rows(
  mutate(two_dimensional_embeddings$node_embeddings, type = "variable k"),
  mutate(two_dimensional_embeddings_fixed$node_embeddings, type = "fixed k")
) %>%
  # The elevation variables are renamed for simplicity
  rename(
    categorical = elevation_group_A,
    continuous = elevation_force
  ) %>%
  ggplot(aes(x = categorical, y = continuous, colour = node)) +
  geom_jitter() +
  facet_wrap(~type) +
  labs(
    title = "Node elevation for two different features",
    # x carries the group_A (categorical) elevation and y the force
    # (continuous) elevation, so the labels must follow that mapping.
    x = "elevation with categorical embedding",
    y = "elevation with continuous embedding"
  )


```