SQL for gcloud bucket

Posted on Jan 16, 2024

TLDR: https://github.com/Njorda/cloudsql/tree/main

The goal with this blog post is to build a small tool to query Google Cloud buckets. We will do this using ChatGPT, I feel like I need to be even better at prompting for coding help. So to start by setting some constraints to limit the scope and make it a reasonable task to finish in a hour we will use go and we will limit our self to SELECT with WHERE we will support predicate push downs but thats it. We also aim for creating an experiance that is similar to psql

The first step is to be able to parse queries, we will not support Common Table Expressions CTE or any other fancy feature for that part to make life easier. Initially I though to not include the exact prompts but after realising it might be tricky for people to play around with and reproduce I decided to keep them in an appendix. However in the end I changed my mind again to not do this since it was so much back and fourth with ChatGPT4. The first step in order to build my small CLI is to get the overview tasks, the steps I decided to follow for the project is the following:

Lexer or Tokenizer: The lexer takes the SQL query as input and breaks it down into tokens. Tokens are the smallest units that have meaning in SQL, like keywords (SELECT, FROM, WHERE), identifiers (table names, column names), operators (=, >, <), and literals (numeric values, strings).
Intermediate Representation: The parser typically converts the query into an intermediate representation (IR). This IR is a data structure (like a parse tree or an abstract syntax tree) that represents the parsed query. The IR is used by other components of the database system to execute the query.
Generate API call: Extract the paramters for the API call to google cloud storage.

The code will be broken down to the following structure:

├── lexer │ ├── lexer.go │ ├── lexer_test.go ├── parser │ ├── sql_parser.go │ ├── sql_parser_test.go ├── main.go ├── go.mod ├── go.sum

The code for making the API calls will live in main.

The first step is to build the parser, ChatGPT4 did most of the heavy lifting and the code only required some minor changes:

package lexer

import (
	"unicode"
)

type TokenType int

var keywords = []string{"SELECT", "FROM", "WHERE"}

const (
	TOKEN_EOF TokenType = iota
	TOKEN_ERROR
	TOKEN_KEYWORD
	TOKEN_IDENTIFIER
	TOKEN_SYMBOL
)

type Token struct {
	Type    TokenType
	Literal string
}

type Lexer struct {
	input        string
	position     int
	readPosition int
	ch           byte
}

func NewLexer(input string) *Lexer {
	l := &Lexer{input: input}
	l.readChar()
	return l
}

func (l *Lexer) readChar() {
	if l.readPosition >= len(l.input) {
		l.ch = 0
	} else {
		l.ch = l.input[l.readPosition]
	}
	l.position = l.readPosition
	l.readPosition++
}

func (l *Lexer) NextToken() Token {
	var tok Token

	l.skipWhitespace()

	switch l.ch {
	case 0:
		tok = Token{Type: TOKEN_EOF, Literal: string("")}
	case '=', ';', '(', ')', ',', '\'', '/':
		tok = Token{Type: TOKEN_SYMBOL, Literal: string(l.ch)}
		l.readChar()
	default:
		if isLetter(l.ch) {

			literal := l.readIdentifier()
			if keyword(literal) {
				tok = Token{Type: TOKEN_KEYWORD, Literal: literal}
				return tok
			}
			tok = Token{Type: TOKEN_IDENTIFIER, Literal: literal}
			return tok
		}
		tok = Token{Type: TOKEN_ERROR, Literal: string(l.ch)}
		l.readChar()
	}
	return tok
}

func keyword(literal string) bool {
	for _, keyword := range keywords {
		if keyword == literal {
			return true
		}
	}
	return false
}

func (l *Lexer) readIdentifier() string {
	position := l.position
	for isLetter(l.ch) {
		l.readChar()
	}
	return l.input[position:l.position]
}

func isLetter(ch byte) bool {
	return unicode.IsLetter(rune(ch)) || ch == '_' || ch == '%'
}

func (l *Lexer) skipWhitespace() {
	for l.ch == ' ' || l.ch == '\t' || l.ch == '\n' || l.ch == '\r' {
		l.readChar()
	}
}

The code also contains a test which shows the output, check here. Next step is to parse the tokenized the input:

package parser

import (
	"fmt"
	"strings"

	"github.com/Njorda.cloudsql/lexer"
)

type KeyValue struct {
	Key   string
	Value string
}

type SQLQuery struct {
	Select []string
	From   string
	Where  KeyValue
	Equals KeyValue
}

// Lets do it like the parser instead!
type Parser struct {
	query lexer.Lexer
}

func NewParser(input string) *Parser {
	return &Parser{query: *lexer.NewLexer(input)}
}

func (p *Parser) nextIdentifier() string {
	for {
		tok := p.query.NextToken()
		if tok.Type == lexer.TOKEN_SYMBOL {
			continue
		}
		return tok.Literal
	}
}

// ParseSQLQuery parses a simple SQL query
func (p *Parser) ParseQuery() (*SQLQuery, error) {
	query := &SQLQuery{}
	for tok := p.query.NextToken(); tok.Type != lexer.TOKEN_EOF; tok = p.query.NextToken() {
		switch tok.Type {
		case lexer.TOKEN_KEYWORD:
			switch strings.ToUpper(tok.Literal) {
			case "SELECT":
				for tok = p.query.NextToken(); tok.Type == lexer.TOKEN_IDENTIFIER || tok.Type == lexer.TOKEN_SYMBOL; tok = p.query.NextToken() {
					switch tok.Type {
					case lexer.TOKEN_SYMBOL:
						continue
					case lexer.TOKEN_IDENTIFIER:
						query.Select = append(query.Select, tok.Literal)
					}
				}
				fallthrough
			case "FROM":
				// no inner query support
				query.From = p.query.NextToken().Literal
			// Currently only supports one where clause, either with = or %.
			case "WHERE":
				kV := KeyValue{}
				kV.Key = p.nextIdentifier()
				// Need to get all the stuff until we get the end of it.
				kV.Value = p.nextIdentifier()
			Exit:
				for tok = p.query.NextToken(); tok.Type == lexer.TOKEN_IDENTIFIER || tok.Type == lexer.TOKEN_SYMBOL; tok = p.query.NextToken() {
					switch {
					case tok.Type == lexer.TOKEN_SYMBOL && tok.Literal == `/`:
						kV.Value = fmt.Sprintf("%v%v", kV.Value, tok.Literal)
					case tok.Type == lexer.TOKEN_SYMBOL && tok.Literal == `=`:
						continue
					case tok.Type == lexer.TOKEN_IDENTIFIER:
						kV.Value = fmt.Sprintf("%v%v", kV.Value, tok.Literal)
					default:
						continue Exit
					}
				}
				if strings.HasSuffix(kV.Value, "%") {
					kV.Value = strings.TrimSuffix(kV.Value, "%")
					query.Where = kV
					continue
				}
				query.Equals = kV

			}
		}
	}
	return query, nil
}

Most of the code so far as been written baser upon the go standard libs. However for the CLI we will use a library in order to give a lot of the basic functionality such as searching previous commands and history. We will also use a library for pretty printing the results.

package main

import (
	"context"
	"fmt"
	"os"
	"strings"

	"cloud.google.com/go/storage"
	"github.com/Njorda.cloudsql/parser"
	"google.golang.org/api/iterator"

	"github.com/chzyer/readline"

	"github.com/jedib0t/go-pretty/v6/table"
)

//var columns = []string{"name", "size", "timeCreated", "timeUpdated", "storageClass", "owner", "contentType", "contentEncoding", "contentDisposition", "retentionTime", "updated"}

func handleInput(ctx context.Context, client *storage.Client, input string) error {
	query, err := parser.NewParser(input).ParseQuery()
	if err != nil {
		return err
	}

	// Name is the only value we have ...
	rows, err := ListObjects(ctx, client, query.From, query.Where.Value, query.Select)
	if err != nil {
		return err
	}
	format(query.Select, rows)
	return nil
}

// CreateClient initializes a new Google Cloud Storage client
func CreateClient(ctx context.Context) (*storage.Client, error) {
	client, err := storage.NewClient(ctx)
	if err != nil {
		return nil, err
	}
	return client, nil
}

// GetObjects lists objects in a given bucket, optionally filtered by a prefix
func GetObjects(ctx context.Context, client *storage.Client, bucketName, objet string, selected []string) ([]string, error) {
	attrs, err := client.Bucket(bucketName).Object(objet).Attrs(ctx)
	if err != nil {
		return nil, err
	}
	var objects []string
	out := map[string]string{}
	out["name"] = attrs.Name
	out["size"] = fmt.Sprintf("%d", attrs.Size)
	out["timeCreated"] = attrs.Created.String()
	out["timeUpdated"] = attrs.Updated.String()
	out["storageClass"] = attrs.StorageClass
	out["owner"] = attrs.Owner
	out["contentType"] = attrs.ContentType
	out["contentEncoding"] = attrs.ContentEncoding
	out["contentDisposition"] = attrs.ContentDisposition
	out["retentionTime"] = attrs.RetentionExpirationTime.GoString()
	out["updated"] = attrs.Updated.String()
	for _, column := range selected {
		objects = append(objects, out[column])
	}
	return objects, nil
}

// ListObjects lists objects in a given bucket, optionally filtered by a prefix
func ListObjects(ctx context.Context, client *storage.Client, bucketName, prefix string, selected []string) ([][]string, error) {
	fmt.Println("The prefix is: ", prefix)
	it := client.Bucket(bucketName).Objects(ctx, &storage.Query{Prefix: prefix})
	var rows [][]string
	for {
		attrs, err := it.Next()
		if err == iterator.Done {
			break
		}
		if err != nil {
			return nil, err
		}
		// Here we could do something with reflect to get all the stuff out!
		// This would be the way.
		out := map[string]string{}
		out["name"] = attrs.Name
		out["size"] = fmt.Sprintf("%d", attrs.Size)
		out["timeCreated"] = attrs.Created.String()
		out["timeUpdated"] = attrs.Updated.String()
		out["storageClass"] = attrs.StorageClass
		out["owner"] = attrs.Owner
		out["contentType"] = attrs.ContentType
		out["contentEncoding"] = attrs.ContentEncoding
		out["contentDisposition"] = attrs.ContentDisposition
		out["retentionTime"] = attrs.RetentionExpirationTime.GoString()
		out["updated"] = attrs.Updated.String()
		row := []string{}
		for _, column := range selected {
			row = append(row, out[column])
		}
		rows = append(rows, row)
	}
	return rows, nil
}

func format(columns []string, tuples [][]string) {
	t := table.NewWriter()
	t.SetOutputMirror(os.Stdout)
	row := table.Row{}
	for _, col := range columns {
		row = append(row, col)
	}
	t.AppendHeader(row)
	rows := []table.Row{}
	for _, tuple := range tuples {
		row := table.Row{}
		for _, val := range tuple {
			row = append(row, val)
		}
		rows = append(rows, row)
	}
	t.AppendRows(rows)
	t.AppendSeparator()
	t.Render()
}

func main() {
	ctx := context.Background()
	client, err := CreateClient(ctx) // Assuming CreateClient is a function you've defined
	if err != nil {
		panic(err)
	}

	rl, err := readline.New("GCSQL> ")
	if err != nil {
		panic(err)
	}
	defer rl.Close()

	fmt.Println("Welcome to GCSQL, the Google Cloud Storage SQL interface.")
	fmt.Println("Type 'EXIT' to quit.")

	for {
		input, err := rl.Readline()
		if err != nil { // io.EOF, readline.ErrInterrupt
			break
		}

		if strings.ToUpper(input) == "EXIT" {
			fmt.Println("Goodbye!")
			break
		}

		// Add the input to history
		rl.SaveHistory(input)

		// Handle the input
		if err := handleInput(ctx, client, input); err != nil {
			fmt.Printf("Error: %v", err)
		}
	}
}

In order to build the binary and try the CLI out go build -o cloudsql and then ./cloudsql:

GCSQL> SELECT name, size FROM ceedai WHERE path = 'ceedai/files/%';
+---------------------------------------------------------------------------------+----------+
| NAME                                                                            | SIZE     |
+---------------------------------------------------------------------------------+----------+
| ceedai/files/tmp1.pdf                                                           | 3372362  |
| ceedai/files/smp.pdf                                                            | 989059   |
| ceedai/files/12.pdf                                                             | 72202    |
| ceedai/files/p29-neumann-cidr20.pdf                                             | 335856   |
| ceedai/files/simon.json                                                         | 30       |
+---------------------------------------------------------------------------------+----------+
GCSQL>

The current implementation is very limited but demonstrates how it can be implemented. Feel free to fork the code and add features. I might come back in later posts and add more features.