Welcome to TERN Knowledge Base

Skip to end of metadata
Go to start of metadata

You are viewing an old version of this page. View the current version.

Compare with Current View Page History

« Previous Version 5 Next »

  1. configure the connection to the Nectar server

  2. write and pack the user data

    1. write the user data in a .fcc file

      # Butane/FCC config for Fedora CoreOS (spec 1.4.0); transpiled to Ignition in the next step.
      variant: fcos
      version: 1.4.0
      passwd:
        # create the NFS-share group and add the core user to it so it can operate on the NFS mount
        groups:
          - name: Q4646RW
            gid: 544646
        users:
          - name: core
            groups:
              - Q4646RW
              # core, adm, wheel, sudo, systemd-journal
      storage:
          # write content into file
          files:
            # disable auto updates by config
            - path: /etc/zincati/config.d/90-disable-auto-updates.toml
              contents:
                inline: |
                  [updates]
                  enabled = false
      systemd:
          units:
              # disabling zincati service only works in fcos 36-20220522.2.1 upwards
              - name: zincati.service
                enabled: false
              # mount NFS onto VM (not enabled directly; started on demand via the automount unit below)
              - name: var-mnt-data-Q4646.mount
                contents: |
                  [Unit]
                  Description = /var/mnt/data/Q4646
                  After=network-online.target
                  Wants=network-online.target
      
                  [Mount]
                  What = 10.255.122.28:/gpfs/general02/pool9200/Q4646/Q4646
                  Where = /var/mnt/data/Q4646
                  Type=nfs
                  Options= rw,hard,nfsvers=3,nosuid,nodev,noresvport
                  TimeoutSec=5
      
                  [Install]
                  WantedBy=multi-user.target
              # automount unit triggers the mount on first access of the path
              - name: var-mnt-data-Q4646.automount
                enabled: true
                contents: |
                  [Unit]
                  Description=/var/mnt/data/Q4646
      
                  [Automount]
                  Where=/var/mnt/data/Q4646
                  TimeoutIdleSec=0
      
                  [Install]
                  WantedBy=multi-user.target
      
    2. generate the .ign user data file from the .fcc file. Docker must be installed.

      # fcc file location: C:\Users\uqtlan\fcos.fcc
      # Transpile the Butane/FCC config into an Ignition (.ign) file using the
      # containerized fcct tool; the host directory is bind-mounted so the
      # output fcos.ign lands next to the input fcos.fcc.
      docker run --rm -i -v C:\Users\uqtlan:/var/opt -w /var/opt \
      quay.io/coreos/fcct:v0.18.0 --pretty --strict --output fcos.ign fcos.fcc
    3. read the user data file as a string and base64-encode it

  3. specify the parameters for the VM

    1. name: name of the instance. drone_pdal_translate

    2. port: network port suffix for mounting NFS.
      "-test(-prod)" for port NFSPort-02-test(NFSPort-02-prod)

    3. image: image for the instance.
      tern-fedora-core-38 for running podman directly

    4. nets: list of network names. ["qld"]

    5. security_groups: list of security group names. ["default", "ssh_public"]

    6. availability_zone: availability zone for the VM. QRISCloud

    7. key: name of the keypair for the vm. ansible

    8. flavor: name of the flavor for creating VM. r3.large

  4. create VM in python by openstacksdk (and push the instance id and ip to xcom)

    # Create a Nectar VM with openstacksdk and push its id/ip to Airflow XCom.
    # Expects in scope: auth, user_data_file, security_groups_list, image,
    # flavor, port_suf, nets, name, availability_zone, key, log, kwargs.
    conn = openstack.connect(**auth)

    # user_data needs to be base64 encoded for the Nova API
    with open(user_data_file, 'r') as f:
        user_data_raw = f.read()
    user_data = base64.b64encode(user_data_raw.encode("utf-8")).decode('utf-8')

    # check whether security groups exist; fail fast on a typo
    security_groups = []
    for security_group in security_groups_list:
        sg = conn.network.find_security_group(security_group)
        if not sg:
            raise Exception('security group {} not found'.format(security_group))
        log.info('adding security group {}:{}'.format(sg.name, sg.id))
        security_groups.append({"name": sg.name})

    # check whether image exists
    im = conn.image.find_image(image)
    if not im:
        raise Exception('image {} not found'.format(image))
    log.info('using image {}:{}'.format(image, im.id))

    # check whether flavor exists
    fl = conn.compute.find_flavor(flavor)
    if not fl:
        raise Exception('flavor {} not found'.format(flavor))
    log.info('using flavor {}:{}'.format(flavor, fl.id))

    # find the first unattached NFS port among nfs-port-01..05 with the given suffix
    port = None
    for num in range(1, 6):
        _port = conn.network.find_port(f"nfs-port-{num:02}{port_suf}")
        if _port and not _port.device_id:
            port = _port
            # use the logger instead of print for consistency with the rest of the task
            log.info(f'using port {port.name}:{port.id}')
            break
    if not port:
        # log.warn is a deprecated alias of log.warning
        log.warning("no available port for NFS. using direct mounting instead, which might cause error.")

    # nics = [{'uuid': some_net.id},{'port': port.id}]
    nics = []
    if port:
        log.info('appending port {}:{}'.format(port.name, port.id))
        nics.append({'port': port.id})
    for net_name in nets:
        # check whether network exists
        net = conn.network.find_network(net_name)
        if net:
            log.info('appending network {}:{}'.format(net_name, net.id))
            nics.append({'uuid': net.id})
        else:
            # fix: log the name that was searched (net is None in this branch)
            log.warning('net {} not found'.format(net_name))

    conn.compute.create_server(name = name,
                                image_id = im.id,
                                flavor_id = fl.id,
                                security_groups = security_groups,
                                availability_zone = availability_zone,
                                key_name = key,
                                networks = nics,
                                user_data = user_data
                                )

    log.info("Sleeping for 5s after create command")
    time.sleep(5)

    try:
        # poll until the instance leaves the BUILD state
        check_instance = conn.compute.find_server(name)
        while check_instance.status != 'ACTIVE' and check_instance.status != 'ERROR':
            log.info("Building instance... please wait...")
            time.sleep(30)
            check_instance = conn.compute.get_server(check_instance.id)
    except Exception as e:
        # fix: the original bare `except:` referenced check_instance.name, which is
        # unbound if find_server itself raised; report the requested name instead
        # and chain the original cause.
        raise Exception("Error building instance {}. Probably a nectar outage.".format(name)) from e

    if check_instance.status == 'ERROR':
        raise Exception("Error building instance {}. Please check OpenStack logs: {}".format(check_instance.name, check_instance.fault))

    log.info("VM running... status=" + conn.compute.get_server(check_instance.id).status)

    log.info("Instance {}:{} created and running".format(check_instance.name, check_instance.id))
    # wait a bit for VM building up
    time.sleep(30)
    # push instance id and ip address for downstream tasks
    kwargs["task_instance"].xcom_push(key = "instance-id", value = check_instance.id)
    kwargs["task_instance"].xcom_push(
        key="private-ip", value = check_instance.addresses['qld'][0]["addr"]
    )

  5. upload taskscripts. skip this step if taskscript is written in user data

    1. grant write permission of the directory for uploading taskscripts.

      # Reuse the stored Airflow SSH connection, but log in as "core"
      # (the default Fedora CoreOS user defined in the fcc above).
      ssh_hook = SSHHook("ssh_ecoplots")
      ssh_hook.username = "core"
      grant_write_permission = SSHOperator(
          executor_config=executor_config,
          # The ID specified for the task.
          task_id="grant_write_permission",
          ssh_hook=ssh_hook,
          # templated: resolves to the private IP pushed to XCom by create_pdal_instance
          remote_host="{{ task_instance.xcom_pull(task_ids='create_pdal_instance', key = 'private-ip') }}",
          # open up /var/opt so the SFTP upload below can write there
          # NOTE(review): chmod 777 is world-writable — acceptable on a throwaway VM, confirm
          command=" ".join(["sudo", "chmod", "777", "/var/opt"])
      ) 
    2. upload taskscript

      # Upload the PDAL task script from the DAG folder onto the new VM via SFTP.
      taskscript_upload = SFTPOperator(
          executor_config=executor_config,
          # The ID specified for the task.
          task_id="taskscript_upload",
          ssh_hook=ssh_hook,
          # templated: resolves to the private IP pushed to XCom by create_pdal_instance
          remote_host="{{ task_instance.xcom_pull(task_ids='create_pdal_instance', key = 'private-ip') }}",
          # source path is resolved relative to this DAG file's directory
          local_filepath = os.path.dirname(os.path.realpath(__file__)) + "/drone_data_update_dag/pdal_task.py",
          remote_filepath="/var/opt/pdal_task.py",
          operation="put",
          create_intermediate_dirs=True
      )
    3. run taskscript with VM

      # Run the uploaded task script inside a PDAL container on the VM,
      # with the NFS share bind-mounted so outputs land on shared storage.
      pdal_trans = SSHOperator(
          executor_config = executor_config,
          # The ID specified for the task.
          task_id = "pdal_trans",
          ssh_hook = ssh_hook,
          # templated: resolves to the private IP pushed to XCom by create_pdal_instance
          remote_host = "{{ task_instance.xcom_pull(task_ids='create_pdal_instance', key = 'private-ip') }}",
          # NOTE(review): "-it" requests a TTY, which an SSHOperator session may not
          # allocate — confirm this works or drop "-t".
          # --userns/--group-add keep the core user's supplementary groups (Q4646RW)
          # inside the container so it can write to the NFS mount.
          # NOTE(review): podman documents "--security-opt label=disable" (with '='),
          # not "label:disable" — verify this spelling is accepted.
          command = " ".join(["podman", "run", "--rm", "-it",
              "-v", "/var/opt/pdal_task.py:/pdal_task.py",
              "-v", "/var/mnt/data/Q4646:/var/mnt/data/Q4646",
              "--userns=keep-id", "--group-add=keep-groups",
              "--security-opt", "label:disable",
              "docker.io/pdal/pdal:sha-3afba0f9",
              "python3", "/pdal_task.py", *pdal_params])
      )
  6. delete vm after use

    # Tear down the VM created by create_pdal_instance.
    # Expects in scope: kwargs, auth, openstack, log.
    instance_id = kwargs["task_instance"].xcom_pull(task_ids='create_pdal_instance', key = 'instance-id')
    conn = openstack.connect(**auth)
    # fix: get_server raises if the instance is already gone; find_server returns
    # None instead, making this teardown task safe to retry / re-run.
    server = conn.compute.find_server(instance_id)
    if server:
        conn.compute.delete_server(server)
    else:
        log.warning('instance {} not found; nothing to delete'.format(instance_id))

  • No labels

0 Comments

You are not logged in. Any changes you make will be marked as anonymous. You may want to Log In if you already have an account.