NixOps + GCE deployment errors

#1

Hey all,

I’m having a frustrating time trying to get a deployment out on GCE. I’m trying to test some changes to NixOps that allow for private IP usage for SSH. I don’t think these changes are the cause of my GCE issues, but I am using a custom nixops, so just in case, here’s the git diff:

diff --git a/nix/gce.nix b/nix/gce.nix
index d106ce0..53d3144 100644
--- a/nix/gce.nix
+++ b/nix/gce.nix
@@ -278,6 +278,14 @@ in
         '';
       };
 
+      associatePublicIpAddress = mkOption {
+        default = true;
+        type = types.bool;
+        description = ''
+          Whether or not to associate a public IP address with this node.
+        '';
+      };
+
       network = mkOption {
         default = null;
         example = "resources.gceNetworks.verySecureNetwork";
diff --git a/nixops/backends/gce.py b/nixops/backends/gce.py
index fdcdb27..725ccab 100644
--- a/nixops/backends/gce.py
+++ b/nixops/backends/gce.py
@@ -37,6 +37,7 @@ class GCEDefinition(MachineDefinition, ResourceDefinition):
         self.copy_option(x, 'project', str)
         self.copy_option(x, 'serviceAccount', str)
         self.copy_option(x, 'canIpForward', bool, optional=True)
+        self.copy_option(x, 'associatePublicIpAddress', bool, optional=True)
         self.access_key_path = self.get_option_value(x, 'accessKey', str)
 
         self.copy_option(x, 'tags', 'strlist')
@@ -104,6 +105,7 @@ class GCEState(MachineState, ResourceState):
         return "gce"
 
     machine_name = attr_property("gce.name", None)
+    associate_public_ip_address = attr_property("gce.associatePublicIpAddress", True)
     public_ipv4 = attr_property("publicIpv4", None)
     private_ipv4 = attr_property("privateIpv4", None)
 
@@ -158,7 +160,7 @@ class GCEState(MachineState, ResourceState):
 
     def address_to(self, resource):
         """Return the IP address to be used to access "resource" from this machine."""
-        if isinstance(resource, GCEState) and resource.network == self.network:
+        if isinstance(resource, GCEState) and (not resource.associate_public_ip_address or resource.network == self.network):
             return resource.private_ipv4
         else:
             return MachineState.address_to(self, resource)
@@ -209,7 +211,8 @@ class GCEState(MachineState, ResourceState):
 
     defn_properties = ['tags', 'region', 'instance_type',
                        'email', 'scopes', 'subnet', 'preemptible',
-                       'metadata', 'ipAddress', 'network']
+                       'metadata', 'ipAddress', 'network',
+                       'associatePublicIpAddress']
 
     def is_deployed(self):
         return (self.vm_id or self.block_device_mapping)
@@ -427,7 +430,7 @@ class GCEState(MachineState, ResourceState):
                                  location = self.connect().ex_get_zone(defn.region),
                                  ex_boot_disk = self.connect().ex_get_volume(boot_disk['disk_name'] or boot_disk['disk'], boot_disk.get('region', None)),
                                  ex_metadata = self.full_metadata(defn.metadata), ex_tags = defn.tags, ex_service_accounts = service_accounts,
-                                 external_ip = (self.connect().ex_get_address(defn.ipAddress) if defn.ipAddress else 'ephemeral'),
+                                 external_ip = (None if not defn.associatePublicIpAddress else self.connect().ex_get_address(defn.ipAddress) if defn.ipAddress else 'ephemeral'),
                                  ex_can_ip_forward = defn.can_ip_forward,
                                  # in theory the API accepts creating an
                                  # instance by specifying only the subnet
@@ -444,9 +447,12 @@ class GCEState(MachineState, ResourceState):
             self.ssh_pinged = False
             self.copy_properties(defn)
             self.public_ipv4 = node.public_ips[0]
-            self.log("got public IP: {0}".format(self.public_ipv4))
+            if self.associate_public_ip_address:
+                self.log("got public IP: {0}".format(self.public_ipv4))
             known_hosts.add(self.public_ipv4, self.public_host_key)
             self.private_ipv4 = node.private_ips[0]
+            if not self.associate_public_ip_address:
+                self.log("got private IP: {0}".format(self.private_ipv4))
             for k,v in self.block_device_mapping.iteritems():
                 v['needsAttach'] = True
                 self.update_block_device_mapping(k, v)
@@ -932,9 +938,14 @@ class GCEState(MachineState, ResourceState):
         return keys
 
     def get_ssh_name(self):
-        if not self.public_ipv4:
-            raise Exception("{0} does not have a public IPv4 address (yet)".format(self.full_name))
-        return self.public_ipv4
+        if self.associate_public_ip_address:
+            if not self.public_ipv4:
+                raise Exception("{0} does not have a public IPv4 address (yet)".format(self.full_name))
+            return self.public_ipv4
+        else:
+            if not self.private_ipv4:
+                raise Exception("{0} does not have a private IPv4 address (yet)".format(self.full_name))
+            return self.private_ipv4
 
     def get_ssh_private_key_file(self):
         return self._ssh_private_key_file or self.write_ssh_private_key(self.private_client_key)

Here’s the background info on my deployment process and steps I took to generate a debug file:

1. Created an account on GCE
2. Created a project: foo
3. Created a service account: bar@foo-000000.iam.gserviceaccount.com
4. Generated and saved a .json key for this account: foo-000000-b14hb14hb14h.json
5. Added the ‘Storage Object Admin’ role to the service account
6. Ran nix-env -i google-cloud-sdk && readlink -f ./foo-000000-b14hb14hb14h.json | gsutil config -e
7. Cloned nixpkgs with git clone, checked out the 18.09 tagged commit hash, and ran ./nixos/maintainers/scripts/gce/create-gce.sh
8. Ran nixops create -d g logical.nix physical.nix && nixops deploy -d g --debug 2> debugoutput

Sparing everyone the XML output (though I’m willing to paste sections on request), the debugoutput file is as follows:

lb-net.............> creating GCE network 'n-b6f433eb524c11e98f6d00155dbfa307-lb-net'...
plain-hc...........> creating GCE HTTP health check 'n-b6f433eb524c11e98f6d00155dbfa307-plain-hc'...
mwilsoncoding-nixos> creating GCE image 'mwilsoncoding-nixos'...
bootstrap..........> creating GCE image 'n-b6f433eb524c11e98f6d00155dbfa307-bootstrap'...
error: Multiple exceptions (4): 
  * bootstrap: {'domain': 'usageLimits', 'message': 'Access Not Configured. Compute Engine API has not been used in project 1078724259120 before or it is disabled. Enable it by visiting https://console.developers.google.com/apis/api/compute.googleapis.com/overview?project=1078724259120 then retry. If you enabled this API recently, wait a few minutes for the action to propagate to our systems and retry.', 'reason': 'accessNotConfigured', 'extendedHelp': 'https://console.developers.google.com/apis/api/compute.googleapis.com/overview?project=1078724259120'}
  * lb-net: {'domain': 'usageLimits', 'message': 'Access Not Configured. Compute Engine API has not been used in project 1078724259120 before or it is disabled. Enable it by visiting https://console.developers.google.com/apis/api/compute.googleapis.com/overview?project=1078724259120 then retry. If you enabled this API recently, wait a few minutes for the action to propagate to our systems and retry.', 'reason': 'accessNotConfigured', 'extendedHelp': 'https://console.developers.google.com/apis/api/compute.googleapis.com/overview?project=1078724259120'}
  * mwilsoncoding-nixos: {'domain': 'usageLimits', 'message': 'Access Not Configured. Compute Engine API has not been used in project 1078724259120 before or it is disabled. Enable it by visiting https://console.developers.google.com/apis/api/compute.googleapis.com/overview?project=1078724259120 then retry. If you enabled this API recently, wait a few minutes for the action to propagate to our systems and retry.', 'reason': 'accessNotConfigured', 'extendedHelp': 'https://console.developers.google.com/apis/api/compute.googleapis.com/overview?project=1078724259120'}
  * plain-hc: {'domain': 'usageLimits', 'message': 'Access Not Configured. Compute Engine API has not been used in project 1078724259120 before or it is disabled. Enable it by visiting https://console.developers.google.com/apis/api/compute.googleapis.com/overview?project=1078724259120 then retry. If you enabled this API recently, wait a few minutes for the action to propagate to our systems and retry.', 'reason': 'accessNotConfigured', 'extendedHelp': 'https://console.developers.google.com/apis/api/compute.googleapis.com/overview?project=1078724259120'}

------------------------------
Traceback (most recent call last):
Traceback (most recent call last):
  File "/nix/store/igwmf6kfsna6rpywdz7qf55jxyv0imkx-nixops-1.6.1pre0_abcdef/bin/.nixops-wrapped", line 1000, in <module>
    e.print_all_backtraces()
  File "/nix/store/igwmf6kfsna6rpywdz7qf55jxyv0imkx-nixops-1.6.1pre0_abcdef/lib/python2.7/site-packages/nixops/parallel.py", line 20, in print_all_backtraces
    traceback.print_exception(e[0], e[1], e[2])
  File "/nix/store/2smkcirh3snjk71ia9s3rmrkfzr1giia-python-2.7.15/lib/python2.7/traceback.py", line 125, in print_exception
    print_tb(tb, limit, file)
  File "/nix/store/2smkcirh3snjk71ia9s3rmrkfzr1giia-python-2.7.15/lib/python2.7/traceback.py", line 61, in print_tb
    f = tb.tb_frame
AttributeError: 'unicode' object has no attribute 'tb_frame'

And if it’s of any use, here is the logical.nix (I’m experimenting with mkReplicas, so forgive me if this looks a bit weird):

let
  mkReplicas =
    build_specification: name_prefix: quantity:
    builtins.listToAttrs (
      builtins.genList (
        count: 
        { "name" = "${name_prefix}${builtins.toString count}"; 
          "value" = build_specification;
        }
      ) quantity
    );

  httpdlb =
    { config, pkgs, ... }:
    { services.httpd = {
        extraModules = ["proxy_balancer" "lbmethod_byrequests"];
        extraConfig = ''
          <Proxy balancer://cluster>
            Allow from all
            BalancerMember http://webserver0 retry=0
            BalancerMember http://webserver1 retry=0
          </Proxy>
          ProxyPass         /    balancer://cluster/
          ProxyPassReverse  /    balancer://cluster/
        '';
      };
    };

  staticwebserver =
    { config, pkgs, ... }:
    { services.httpd = {
        documentRoot = "${pkgs.valgrind.doc}/share/doc/valgrind/html";
      };
    };
in
{
  network = {
    description = "Load balanced static web server";
  };

  defaults = {
    imports = [ ./firewallcommon.nix ./httpdcommon.nix ];
  };
  
} //
mkReplicas httpdlb "loadbalancer" 1 //
mkReplicas staticwebserver "webserver" 2

And the physical.nix:

let
  credentials = {
    project = "foo";
    serviceAccount = "bar@foo-000000.iam.gserviceaccount.com";
    accessKey = "/home/mwilson/nixtalks/gce/foo-000000-b14hb14hb14h.json";
  };

  gcevm =
    { resources, ... }:
    {
      deployment = {
        targetEnv = "gce";
        gce = credentials // {
          region = "us-east4";
          bootstrapImage = resources.gceImages.mwilsoncoding-nixos;
          tags = [ "public-http" ];
          network = resources.gceNetworks.lb-net;
        };
      };
    };
in
  {
    resources = {
      gceNetworks.lb-net = credentials // {
        addressRange = "192.168.4.0/24";
        firewall = {
          allow-http = {
            targetTags = [ "public-http" ];
            allowed.tcp = [ 80 ];
          };
          allow-ping.allowed.icmp = null;
        };
      };

      gceHTTPHealthChecks.plain-hc = credentials;

      gceTargetPools.backends =
        { resources, nodes, ... }:
        credentials // {
          region = "us-east4";
          healthCheck = resources.gceHTTPHealthChecks.plain-hc;
          machines = with nodes; [ webserver0 webserver1 ];
        };

      gceForwardingRules.lb =
        { resources, ... }:
        credentials // {
          protocol = "TCP";
          region = "us-east4";
          portRange = "80";
          targetPool = resources.gceTargetPools.backends;
          description = "Alternative HTTP Load Balancer";
        };

      gceImages.mwilsoncoding-nixos = credentials // {
        name = "mwilsoncoding-nixos";
        sourceUri = "gs://mwilsoncoding-nixos/nixos-image-18.09.2317.18fec2687c0-x86_64-linux.raw.tar.gz";
      };
    };

    loadbalancer0 = gcevm;

    webserver0 = gcevm;
    webserver1 = gcevm;
  }

Any ideas on how to get this to deploy correctly?

#2

Many thanks to amine-chikhaoui for the assist in IRC!

If it helps anyone else with any random thing in the future, here’s the main issue:

In physical.nix, project = "foo"; should instead be project = "foo-000000";
The first is my project’s name, where the second is my project’s ID.

Also, this bit in the git diff:

external_ip = (None if not defn.associatePublicIpAddress else ...

should instead be:
external_ip = (None if not defn.associate_public_ip_address else ...
because GCEDefinition::__init__ copies the associatePublicIpAddress option into defn as an attribute, renaming it like so: re.sub(r'([a-z])([A-Z])',r'\1_\2', name).lower()

Making that change got it all compiling and deploying correctly when associatePublicIpAddress is set to true. I’m still working on the false case: I currently get an index out of range exception instead of the expected failure, which would be an exception stating that the machines could not be reached via SSH (because I’m deploying from my work laptop, which is not local to GCE).