You can deploy a configurable JupyterHub installation that reads the add-on's environment variables and automatically configures a client whose queries return pandas DataFrames.
import grax_athena

# The pre-configured client runs the query against the data lake
# and returns the result as a pandas DataFrame
df = grax_athena.query_data_lake("SELECT COUNT(*) FROM object_account")
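Because the result is an ordinary pandas DataFrame, you can work with it using the standard pandas API. A minimal sketch: Athena names an unaliased COUNT(*) column _col0, so aliasing the aggregate in SQL gives the DataFrame a predictable column name.

# Alias the aggregate so the resulting column is easy to reference
df = grax_athena.query_data_lake(
    "SELECT COUNT(*) AS account_count FROM object_account"
)
print(df["account_count"].iloc[0])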
Connecting from Python
You can use any Athena client to connect to and query the data lake. Here is an example using the pyathena client:
import os
from pyathena import connect
# Retrieve AWS credentials from environment variables
aws_access_key_id = os.environ.get('GRAX_AWS_ACCESS_KEY_ID')
aws_secret_access_key = os.environ.get('GRAX_AWS_SECRET_ACCESS_KEY')
aws_region = os.environ.get('GRAX_AWS_REGION', 'us-east-1')
# Athena connection parameters
s3_staging_dir = os.environ.get('GRAX_S3_STAGING_DIR')
athena_database = os.environ.get('GRAX_ATHENA_DATABASE')
athena_workgroup = os.environ.get('GRAX_ATHENA_WORKGROUP')
# Establish connection to Athena
conn = connect(
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
    s3_staging_dir=s3_staging_dir,
    work_group=athena_workgroup,
    region_name=aws_region,
)
print("Connection to Athena established successfully.")
# Execute the query
cursor = conn.cursor()
query = f"SELECT COUNT(*) FROM {athena_database}.\"object_account\""
cursor.execute(query)
# Fetch and print the result
result = cursor.fetchone()
count = result[0]
print(f"Number of rows in object_account: {count}")

# Close the cursor and connection when finished
cursor.close()
conn.close()