Start CloudSQL Proxy on Python Dataflow / Apache Beam

I managed to find a better, or at least easier, solution: in the DoFn's setup function, use the Cloud SQL Proxy to set up the connection in advance.

class MyDoFn(beam.DoFn):
    """DoFn that launches the Cloud SQL Proxy on the worker in ``setup``.

    NOTE(review): ``self.sql_args`` is not assigned anywhere in this snippet;
    presumably it is set in ``__init__`` as a mapping that contains the key
    ``'cloud_sql_connection_name'`` -- confirm against the full class.
    """

    def setup(self):
        """Download the Cloud SQL Proxy binary, make it executable, start it."""
        # wget needs the download URL; with only ``-O cloud_sql_proxy`` there
        # is nothing to fetch and the later steps fail on a missing binary.
        os.system(
            "wget https://dl.google.com/cloudsql/cloud_sql_proxy.linux.amd64 "
            "-O cloud_sql_proxy"
        )
        os.system("chmod +x cloud_sql_proxy")
        # The trailing "&" runs the proxy in the background so setup() returns
        # while the proxy keeps listening on local TCP port 3306.
        os.system(f"./cloud_sql_proxy -instances={self.sql_args['cloud_sql_connection_name']}=tcp:3306 &")

Workaround Solution:

I finally found a workaround. I took the idea of connecting via the public IP of the CloudSQL instance. For that, you need to allow connections to your CloudSQL instance from every IP:

  1. Go to the overview page of your CloudSQL instance in GCP
  2. Click on the Authorization tab
  3. Click on Add network and add 0.0.0.0/0 (!! this will allow every IP address to connect to your instance !!)

To add security to the process, I used SSL keys and only allowed SSL connections to the instance:

  1. Click on SSL tab
  2. Click on Create a new certificate to create an SSL certificate for your server
  3. Click on Create a client certificate to create an SSL certificate for your client
  4. Click on Allow only SSL connections to reject all non-SSL connection attempts

After that, I stored the certificates in a Google Cloud Storage bucket and loaded them before connecting within the Dataflow job, i.e.:

import os
import select
import stat

import psycopg2
import psycopg2.extensions
from google.cloud import storage

# Function to wait for open connection when processing parallel
def wait(conn):
    """Block until ``conn.poll()`` reports that the connection is ready.

    Repeatedly polls the connection. While the poll state asks for a write or
    a read, the function sleeps in ``select.select`` on the connection's file
    descriptor until it becomes writable or readable, then polls again.

    Args:
        conn: A psycopg2 connection object (must provide ``poll()`` and
            ``fileno()``).

    Raises:
        psycopg2.OperationalError: If ``poll()`` returns a state other than
            ``POLL_OK``, ``POLL_WRITE`` or ``POLL_READ``.
    """
    while True:
        state = conn.poll()
        if state == psycopg2.extensions.POLL_OK:
            # Connection is ready; without this break the loop never ends.
            break
        elif state == psycopg2.extensions.POLL_WRITE:
            # Wait until the socket is writable.
            select.select([], [conn.fileno()], [])
        elif state == psycopg2.extensions.POLL_READ:
            # Wait until the socket is readable.
            select.select([conn.fileno()], [], [])
        else:
            raise psycopg2.OperationalError("poll() returned %s" % state)

# Function which returns a connection which can be used for queries
def connect_to_db(host, hostaddr, dbname, user, password, sslmode='verify-full'):
    """Fetch the SSL certificates from GCS and open a psycopg2 connection.

    The server CA, client key and client certificate are downloaded from a
    Google Cloud Storage bucket into the current working directory and then
    handed to ``psycopg2.connect``.

    Args:
        host: Host name passed to ``psycopg2.connect`` as ``host``.
        hostaddr: IP address passed to ``psycopg2.connect`` as ``hostaddr``.
        dbname: Name of the database to connect to.
        user: Database user name.
        password: Password of ``user``.
        sslmode: SSL mode forwarded to ``psycopg2.connect``.

    Returns:
        The connection object returned by ``psycopg2.connect``.
    """
    # Placeholders: replace with your bucket name and the object prefix under
    # which the three certificate files were uploaded.
    bucket_name = "<YOUR_BUCKET_NAME>"
    cert_prefix = "<PATH_TO_CERTS>"

    sslrootcert = 'server-ca.pem'
    sslkey = 'client-key.pem'
    sslcert = 'client-cert.pem'

    # Get keys from GCS
    client = storage.Client()
    bucket = client.get_bucket(bucket_name)
    for filename in (sslrootcert, sslkey, sslcert):
        # The files must exist locally before chmod / connect can use them.
        bucket.blob(cert_prefix + "/" + filename).download_to_filename(filename)

    # Restrict the private key to the owner only (no group/world access).
    os.chmod(sslkey, stat.S_IRWXU)

    con = psycopg2.connect(
        host=host,
        hostaddr=hostaddr,
        dbname=dbname,
        user=user,
        password=password,
        # Previously accepted as a parameter but never forwarded.
        sslmode=sslmode,
        sslrootcert=sslrootcert,
        sslcert=sslcert,
        sslkey=sslkey)
    return con

I then use these functions in a custom ParDo to perform queries.
Minimal example:

import apache_beam as beam

class ReadSQLTableNames(beam.DoFn):
    """ParDo class to get all table names of a given cloudSQL database.

    It will return each table name.

    Args:
        host: Host name forwarded to ``connect_to_db``.
        hostaddr: IP address forwarded to ``connect_to_db``.
        dbname: Name of the database to inspect.
        username: Database user name.
        password: Password of ``username``.
    """

    def __init__(self, host, hostaddr, dbname, username, password):
        super(ReadSQLTableNames, self).__init__()
        self.host = host
        self.hostaddr = hostaddr
        self.dbname = dbname
        self.username = username
        self.password = password

    def process(self, element):
        """Yield the name of every table in the ``public`` schema.

        ``element`` is not read; it only triggers the query.
        """
        # Imported here so the name is in scope wherever the class is used.
        from psycopg2.extras import RealDictCursor

        # Connect to database
        con = connect_to_db(host=self.host,
                            hostaddr=self.hostaddr,
                            dbname=self.dbname,
                            user=self.username,
                            password=self.password)
        try:
            # Wait for free connection
            wait(con)
            # Create cursor to query data; rows come back as dicts so the
            # column can be read by its alias below.
            cur = con.cursor(cursor_factory=RealDictCursor)
            try:
                # Get all table names
                cur.execute(
                    """
                    SELECT
                    tablename as table
                    FROM pg_tables
                    WHERE schemaname = 'public'
                    """
                )
                table_names = cur.fetchall()
            finally:
                cur.close()
        finally:
            # Always release the connection, even if the query fails.
            con.close()

        for table_name in table_names:
            yield table_name["table"]

A part of the pipeline then could look like this:

# Current workaround to query all tables:
# Create a dummy initiator PCollection with one element, so that the ParDo
# below is invoked exactly once.
init = p | 'Begin pipeline with initiator' >> beam.Create(['All tables initializer'])

# NOTE(review): ``known_args.host`` is inferred from the sibling arguments;
# confirm the option name against the argument parser.
tables = init | 'Get table names' >> beam.ParDo(ReadSQLTableNames(
    host=known_args.host,
    hostaddr=known_args.hostaddr,
    dbname=known_args.db_name,
    username=known_args.user,
    password=known_args.password))

I hope this solution helps others with similar problems