Versions Compared

Key

  • This line was added.
  • This line was removed.
  • Formatting was changed.
  1. configure connection to nectar server

  2. write and pack the user data

    1. write the user data in a .fcc file

      Code Block
      # Butane (FCC) config for the Fedora CoreOS VM. It is transpiled to an
      # Ignition file and passed to the instance as user data.
      variant: fcos
      version: 1.4.0
      passwd:
        # Create the group used for working on the NFS share and add the
        # "core" user to it.
        groups:
          - name: Q4646RW
            gid: 544646
        users:
          - name: core
            groups:
              - Q4646RW
              # NOTE(review): presumably the default groups of "core", kept here
              # for reference only -- confirm whether they need to be listed:
              # core, adm, wheel, sudo, systemd-journal
      storage:
          # Files written to the VM's filesystem.
          files:
            # Zincati drop-in config that turns automatic updates off.
            - path: /etc/zincati/config.d/90-disable-auto-updates.toml
              contents:
                inline: |
                  [updates]
                  enabled = false
      systemd:
          units:
              # Disabling the zincati service only works in
              # FCOS 36-20220522.2.1 and later.
              - name: zincati.service
                enabled: false
              # NFS mount for /var/mnt/data/Q4646. The .mount unit has no
              # "enabled" key; only the matching .automount unit further down
              # is enabled. Both unit names mirror the Where= path.
              - name: var-mnt-data-Q4646.mount
                contents: |
                  [Unit]
                  Description = /var/mnt/data/Q4646
                  After=network-online.target
                  Wants=network-online.target
      
                  [Mount]
                  What = 10.255.122.28:/gpfs/general02/pool9200/Q4646/Q4646
                  Where = /var/mnt/data/Q4646
                  Type=nfs
                  Options= rw,hard,nfsvers=3,nosuid,nodev,noresvport
                  TimeoutSec=5
      
                  [Install]
                  WantedBy=multi-user.target
              - name: var-mnt-data-Q4646.automount
                enabled: true
                contents: |
                  [Unit]
                  Description=/var/mnt/data/Q4646
      
                  [Automount]
                  Where=/var/mnt/data/Q4646
                  TimeoutIdleSec=0
      
                  [Install]
                  WantedBy=multi-user.target
      
    2. Generate the Ignition (.ign) user data file from the .fcc file. Docker must be installed.

      Code Block
      languagebash
      # fcc file location: C:\Users\uqtlan\fcos.fcc
      # The host folder is mounted at /var/opt and used as the working directory,
      # so fcos.fcc is read from it and fcos.ign is written next to it.
      # The Windows path is quoted: unquoted, bash would strip its backslashes.
      docker run --rm -i -v "C:\Users\uqtlan:/var/opt" -w /var/opt \
      quay.io/coreos/fcct:v0.18.0 --pretty --strict --output fcos.ign fcos.fcc
  3. specify the parameters for the VM

    1. name: name of the instance. drone_pdal_translate

    2. port: suffix of the network port name used for mounting NFS.
      "-test" (or "-prod") selects an NFS port such as nfs-port-02-test (or nfs-port-02-prod).

    3. image: image for the instance.
      tern-fedora-core-38 for running podman directly

    4. nets: list of network names. ["qld"]

    5. security_groups: list of security group names. ["default", "ssh_public"]

    6. availability_zone: availability zone for the VM. QRISCloud

    7. key: name of the keypair for the vm. ansible

    8. flavor: name of the flavor for creating VM. r3.large

    9. user_data: read from ign file as string and base64-encode it

  4. Create the VM in Python with openstacksdk (and push the instance id and IP address to XCom)

    Code Block
    languagepy
    # Create the VM with openstacksdk, wait until it is ACTIVE, then publish its
    # id and private IP through XCom for the downstream tasks.
    conn = openstack.connect(**auth)

    # user_data needs to be base64 encoded
    with open(user_data_file, 'r', encoding='utf-8') as file:
        user_data_raw = file.read()
    user_data = base64.b64encode(user_data_raw.encode("utf-8")).decode('utf-8')

    # Resolve every requested security group; a missing one is fatal.
    security_groups = []
    for security_group in security_groups_list:
        sg = conn.network.find_security_group(security_group)
        if not sg:
            raise Exception('security group {} not found'.format(security_group))
        log.info('adding security group {}:{}'.format(sg.name,sg.id))
        security_groups.append({"name":sg.name})

    # The image must exist.
    im = conn.image.find_image(image)
    if not im:
        raise Exception('image {} not found'.format(image))
    log.info('using image {}:{}'.format(image,im.id))

    # The flavor must exist.
    fl = conn.compute.find_flavor(flavor)
    if not fl:
        raise Exception('flavor {} not found'.format(flavor))
    log.info('using flavor {}:{}'.format(flavor,fl.id))

    # Pick the first of nfs-port-01..05 (with the configured suffix) that is
    # not attached to a device yet.
    port = None
    for num in range(1,6):
        _port = conn.network.find_port(f"nfs-port-{num:02}{port_suf}")
        if _port and not _port.device_id:
            port = _port
            log.info('using port {}:{}'.format(port.name,port.id))
            break
    if not port:
        log.warning("no available port for NFS. using direct mounting instead, which might cause error.")

    # nics = [{'uuid': some_net.id},{'port': port.id}]
    nics = []
    if port:
        log.info('appending port {}:{}'.format(port.name,port.id))
        nics.append({'port': port.id})
    for net_name in nets:
        # A missing network is only logged; the VM is still created without it.
        net = conn.network.find_network(net_name)
        if net:
            log.info('appending network {}:{}'.format(net_name,net.id))
            nics.append({'uuid': net.id})
        else:
            # Log the requested name: `net` is None in this branch.
            log.warning('net {} not found'.format(net_name))

    # Keep the returned server so polling uses its id rather than a lookup by
    # name, which is ambiguous when another instance shares the name.
    server = conn.compute.create_server(name = name,
                                        image_id = im.id,
                                        flavor_id = fl.id,
                                        security_groups = security_groups,
                                        availability_zone = availability_zone,
                                        key_name = key,
                                        networks = nics,
                                        user_data = user_data
                                        )

    log.info("Sleeping for 5s after create command")
    time.sleep(5)

    # Poll every 30s until the build finishes one way or the other.
    # NOTE(review): there is no upper bound on this loop; presumably the task's
    # execution timeout covers a build that never leaves BUILD -- confirm.
    try:
        check_instance = conn.compute.get_server(server.id)
        while check_instance.status not in ('ACTIVE', 'ERROR'):
            log.info("Building instance... please wait...")
            time.sleep(30)
            check_instance = conn.compute.get_server(check_instance.id)
    except Exception as err:
        # Use `name` here: check_instance may not be bound if the first lookup failed.
        raise Exception("Error building instance {}. Probably a nectar outage.".format(name)) from err

    if check_instance.status == 'ERROR':
        raise Exception("Error building instance {}. Please check OpenStack logs: {}".format(check_instance.name,check_instance.fault))

    log.info("VM running... status=" + check_instance.status)

    log.info("Instance {}:{} created and running".format(check_instance.name, check_instance.id))
    # wait a bit for VM building up
    time.sleep(30)
    # push instance id and ip address
    kwargs["task_instance"].xcom_push(key = "instance-id", value = check_instance.id)
    # NOTE(review): the 'qld' network key is hard-coded here; this assumes
    # 'qld' is one of the networks in `nets`.
    kwargs["task_instance"].xcom_push(
        key="private-ip", value = check_instance.addresses['qld'][0]["addr"]
    )

  5. upload taskscripts. skip this step if taskscript is written in user data

    1. grant write permission of the directory for uploading taskscripts.

      Code Block
      languagepy
      # SSH hook built from the "ssh_ecoplots" connection, logging in as "core".
      ssh_hook = SSHHook(ssh_conn_id="ssh_ecoplots")
      ssh_hook.username = "core"

      # Make /var/opt world-writable on the VM so the task script can be
      # uploaded there.
      grant_write_permission = SSHOperator(
          task_id="grant_write_permission",
          executor_config=executor_config,
          ssh_hook=ssh_hook,
          # Private IP pushed to XCom by the create_pdal_instance task.
          remote_host="{{ task_instance.xcom_pull(task_ids='create_pdal_instance', key = 'private-ip') }}",
          command=" ".join(("sudo", "chmod", "777", "/var/opt")),
      )
    2. upload taskscript

      Code Block
      languagepy
      # Copy pdal_task.py, which sits in drone_data_update_dag/ next to this
      # DAG file, to /var/opt on the VM over SFTP.
      taskscript_upload = SFTPOperator(
          task_id="taskscript_upload",
          executor_config=executor_config,
          ssh_hook=ssh_hook,
          # Private IP pushed to XCom by the create_pdal_instance task.
          remote_host="{{ task_instance.xcom_pull(task_ids='create_pdal_instance', key = 'private-ip') }}",
          operation="put",
          local_filepath=f"{os.path.dirname(os.path.realpath(__file__))}/drone_data_update_dag/pdal_task.py",
          remote_filepath="/var/opt/pdal_task.py",
          create_intermediate_dirs=True,
      )
    3. Run the task script on the VM

      Code Block
      languagepy
      # Run the uploaded task script on the VM inside the PDAL container.
      pdal_trans = SSHOperator(
          task_id="pdal_trans",
          executor_config=executor_config,
          ssh_hook=ssh_hook,
          # Private IP pushed to XCom by the create_pdal_instance task.
          remote_host="{{ task_instance.xcom_pull(task_ids='create_pdal_instance', key = 'private-ip') }}",
          command=" ".join((
              "podman", "run", "--rm", "-it",
              # Bind-mount the script and the NFS share into the container.
              "-v", "/var/opt/pdal_task.py:/pdal_task.py",
              "-v", "/var/mnt/data/Q4646:/var/mnt/data/Q4646",
              # NOTE(review): presumably keeps the host user's uid and groups
              # inside the container so NFS permissions still apply -- confirm.
              "--userns=keep-id", "--group-add=keep-groups",
              "--security-opt", "label:disable",
              "docker.io/pdal/pdal:sha-3afba0f9",
              "python3", "/pdal_task.py",
              # Extra arguments for the script are appended unquoted.
              *pdal_params,
          )),
      )
  6. delete vm after use

    Code Block
    languagepy
    # Delete the VM whose id the create_pdal_instance task pushed to XCom.
    instance_id = kwargs["task_instance"].xcom_pull(task_ids='create_pdal_instance', key = 'instance-id')
    if not instance_id:
        # Nothing was pushed, so there is no known server to delete; fail with
        # a clear message instead of sending an empty id to the API.
        raise ValueError("no instance-id found in XCom from create_pdal_instance; nothing to delete")
    conn = openstack.connect(**auth)
    # delete_server accepts the id directly. With ignore_missing=True a retried
    # cleanup does not fail if the server is already gone.
    conn.compute.delete_server(instance_id, ignore_missing=True)

...