PostgreSQL Essentials & pgvector

PostgreSQL advanced features including procedures, views, pgvector for vector similarity search, and Docker setup.


Docker Setup

Docker Run

 1# Run PostgreSQL
 2docker run -d \
 3  --name postgres \
 4  -e POSTGRES_PASSWORD=password \
 5  -e POSTGRES_USER=myuser \
 6  -e POSTGRES_DB=mydb \
 7  -v postgres-data:/var/lib/postgresql/data \
 8  -p 5432:5432 \
 9  postgres:16-alpine
10
11# Connect to PostgreSQL
12docker exec -it postgres psql -U myuser -d mydb
13
14# With pgvector
15docker run -d \
16  --name postgres-vector \
17  -e POSTGRES_PASSWORD=password \
 18  -v postgres-vector-data:/var/lib/postgresql/data \
19  -p 5432:5432 \
20  pgvector/pgvector:pg16

Docker Compose

 1version: '3.8'
 2
 3services:
 4  postgres:
 5    image: postgres:16-alpine
 6    container_name: postgres
 7    environment:
 8      POSTGRES_USER: myuser
 9      POSTGRES_PASSWORD: password
10      POSTGRES_DB: mydb
11    volumes:
12      - postgres-data:/var/lib/postgresql/data
13      - ./init.sql:/docker-entrypoint-initdb.d/init.sql
14    ports:
15      - "5432:5432"
16    healthcheck:
17      test: ["CMD-SHELL", "pg_isready -U myuser"]
18      interval: 10s
19      timeout: 5s
20      retries: 5
21    restart: unless-stopped
22
23  # PostgreSQL with pgvector
24  postgres-vector:
25    image: pgvector/pgvector:pg16
26    container_name: postgres-vector
27    environment:
28      POSTGRES_USER: vectoruser
29      POSTGRES_PASSWORD: password
30      POSTGRES_DB: vectordb
31    volumes:
32      - postgres-vector-data:/var/lib/postgresql/data
33    ports:
34      - "5433:5432"
35    restart: unless-stopped
36
37  # pgAdmin (optional)
38  pgadmin:
39    image: dpage/pgadmin4
40    container_name: pgadmin
41    environment:
42      PGADMIN_DEFAULT_EMAIL: admin@example.com
43      PGADMIN_DEFAULT_PASSWORD: admin
44    ports:
45      - "5050:80"
46    depends_on:
47      - postgres
48    restart: unless-stopped
49
50volumes:
51  postgres-data:
52  postgres-vector-data:
 1# Start services
 2docker-compose up -d
 3
 4# View logs
 5docker-compose logs -f postgres
 6
 7# Connect to database
 8docker-compose exec postgres psql -U myuser -d mydb
 9
10# Stop services
11docker-compose down
12
13# Stop and remove volumes
14docker-compose down -v

Basic Commands

 1# Connect to database
 2psql -h localhost -U myuser -d mydb
 3
 4# List databases
 5\l
 6
 7# Connect to database
 8\c mydb
 9
10# List tables
11\dt
12
13# Describe table
14\d table_name
15
16# List views
17\dv
18
19# List functions
20\df
21
22# List schemas
23\dn
24
25# Execute SQL file
26\i /path/to/file.sql
27
28# Export query results to CSV
29\copy (SELECT * FROM users) TO '/path/to/users.csv' CSV HEADER
30
31# Quit
32\q

Views

 1-- Create view
 2CREATE VIEW user_orders AS
 3SELECT 
 4    u.id,
 5    u.username,
 6    u.email,
 7    COUNT(o.id) AS order_count,
 8    SUM(o.total) AS total_spent
 9FROM users u
10LEFT JOIN orders o ON u.id = o.user_id
11GROUP BY u.id, u.username, u.email;
12
13-- Query view
14SELECT * FROM user_orders WHERE order_count > 5;
15
16-- Materialized view (cached results)
17CREATE MATERIALIZED VIEW user_stats AS
18SELECT 
19    u.id,
20    u.username,
21    COUNT(o.id) AS order_count,
22    AVG(o.total) AS avg_order_value
23FROM users u
24LEFT JOIN orders o ON u.id = o.user_id
25GROUP BY u.id, u.username;
26
27-- Refresh materialized view
28REFRESH MATERIALIZED VIEW user_stats;
29
 30-- Refresh concurrently (doesn't block reads; requires a UNIQUE index on the view)
 31REFRESH MATERIALIZED VIEW CONCURRENTLY user_stats;
32
33-- Drop view
34DROP VIEW user_orders;
35DROP MATERIALIZED VIEW user_stats;
36
37-- Replace view
38CREATE OR REPLACE VIEW user_orders AS
39SELECT u.id, u.username, COUNT(o.id) AS order_count
40FROM users u
41LEFT JOIN orders o ON u.id = o.user_id
42GROUP BY u.id, u.username;

Stored Procedures & Functions

Functions

 1-- Simple function
 2CREATE OR REPLACE FUNCTION get_user_order_count(user_id_param INTEGER)
 3RETURNS INTEGER AS $$
 4DECLARE
 5    order_count INTEGER;
 6BEGIN
 7    SELECT COUNT(*) INTO order_count
 8    FROM orders
 9    WHERE user_id = user_id_param;
10    
11    RETURN order_count;
12END;
13$$ LANGUAGE plpgsql;
14
15-- Usage
16SELECT get_user_order_count(1);
17
18-- Function returning table
19CREATE OR REPLACE FUNCTION get_high_value_customers(min_spent DECIMAL)
20RETURNS TABLE(
21    user_id INTEGER,
22    username VARCHAR,
23    total_spent DECIMAL
24) AS $$
25BEGIN
26    RETURN QUERY
27    SELECT 
28        u.id,
29        u.username,
30        SUM(o.total) AS total
31    FROM users u
32    INNER JOIN orders o ON u.id = o.user_id
33    GROUP BY u.id, u.username
34    HAVING SUM(o.total) >= min_spent;
35END;
36$$ LANGUAGE plpgsql;
37
38-- Usage
39SELECT * FROM get_high_value_customers(1000.00);
40
41-- Function with exception handling
42CREATE OR REPLACE FUNCTION safe_divide(a DECIMAL, b DECIMAL)
43RETURNS DECIMAL AS $$
44BEGIN
45    IF b = 0 THEN
46        RAISE EXCEPTION 'Division by zero';
47    END IF;
48    RETURN a / b;
49EXCEPTION
50    WHEN division_by_zero THEN
51        RETURN NULL;
52END;
53$$ LANGUAGE plpgsql;

Procedures

 1-- Stored procedure
 2CREATE OR REPLACE PROCEDURE process_order(
 3    p_user_id INTEGER,
 4    p_total DECIMAL
 5)
 6LANGUAGE plpgsql
 7AS $$
 8DECLARE
 9    v_order_id INTEGER;
10BEGIN
11    -- Insert order
12    INSERT INTO orders (user_id, total, status)
13    VALUES (p_user_id, p_total, 'pending')
14    RETURNING id INTO v_order_id;
15    
16    -- Update user stats
17    UPDATE user_stats
18    SET order_count = order_count + 1,
19        total_spent = total_spent + p_total
20    WHERE user_id = p_user_id;
21    
22    -- Log
23    INSERT INTO audit_log (action, details)
24    VALUES ('order_created', 'Order ' || v_order_id || ' created');
25    
26    COMMIT;
27END;
28$$;
29
30-- Call procedure
31CALL process_order(1, 99.99);
32
33-- Procedure with transaction control
34CREATE OR REPLACE PROCEDURE transfer_funds(
35    from_account INTEGER,
36    to_account INTEGER,
37    amount DECIMAL
38)
39LANGUAGE plpgsql
40AS $$
41BEGIN
42    -- Start transaction
43    BEGIN
44        -- Deduct from source
45        UPDATE accounts
46        SET balance = balance - amount
47        WHERE id = from_account;
48        
49        -- Check if sufficient funds
50        IF NOT FOUND OR (SELECT balance FROM accounts WHERE id = from_account) < 0 THEN
51            RAISE EXCEPTION 'Insufficient funds';
52        END IF;
53        
54        -- Add to destination
55        UPDATE accounts
56        SET balance = balance + amount
57        WHERE id = to_account;
58        
 59    EXCEPTION
 60        WHEN OTHERS THEN
 61            -- The BEGIN...EXCEPTION block is a subtransaction: COMMIT/ROLLBACK
 62            -- are not allowed here ("cannot commit while a subtransaction is
 63            -- active"). The subtransaction rolls back automatically; re-raise.
 64            RAISE;
64    END;
65END;
66$$;

Triggers

 1-- Create trigger function
 2CREATE OR REPLACE FUNCTION update_modified_timestamp()
 3RETURNS TRIGGER AS $$
 4BEGIN
 5    NEW.updated_at = CURRENT_TIMESTAMP;
 6    RETURN NEW;
 7END;
 8$$ LANGUAGE plpgsql;
 9
10-- Create trigger
11CREATE TRIGGER users_updated_at
12BEFORE UPDATE ON users
13FOR EACH ROW
14EXECUTE FUNCTION update_modified_timestamp();
15
16-- Audit trigger
17CREATE OR REPLACE FUNCTION audit_user_changes()
18RETURNS TRIGGER AS $$
19BEGIN
20    IF TG_OP = 'INSERT' THEN
21        INSERT INTO audit_log (table_name, action, new_data)
22        VALUES ('users', 'INSERT', row_to_json(NEW));
23    ELSIF TG_OP = 'UPDATE' THEN
24        INSERT INTO audit_log (table_name, action, old_data, new_data)
25        VALUES ('users', 'UPDATE', row_to_json(OLD), row_to_json(NEW));
26    ELSIF TG_OP = 'DELETE' THEN
27        INSERT INTO audit_log (table_name, action, old_data)
28        VALUES ('users', 'DELETE', row_to_json(OLD));
29    END IF;
30    RETURN NULL;
31END;
32$$ LANGUAGE plpgsql;
33
34CREATE TRIGGER audit_users
35AFTER INSERT OR UPDATE OR DELETE ON users
36FOR EACH ROW
37EXECUTE FUNCTION audit_user_changes();

pgvector

Installation

1-- Enable extension
2CREATE EXTENSION IF NOT EXISTS vector;

Create Table with Vector Column

 1-- Create table for embeddings
 2CREATE TABLE documents (
 3    id SERIAL PRIMARY KEY,
 4    title TEXT NOT NULL,
 5    content TEXT NOT NULL,
 6    embedding vector(1536),  -- OpenAI ada-002 dimension
 7    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
 8);
 9
10-- Create index for similarity search
11CREATE INDEX ON documents USING ivfflat (embedding vector_cosine_ops) WITH (lists = 100);
12
13-- Or use HNSW index (better for high-dimensional vectors)
14CREATE INDEX ON documents USING hnsw (embedding vector_cosine_ops);

Insert Vectors

 1-- Insert document with embedding
 2INSERT INTO documents (title, content, embedding)
 3VALUES (
 4    'Machine Learning Basics',
 5    'Introduction to ML concepts...',
 6    '[0.1, 0.2, 0.3, ...]'::vector  -- 1536 dimensions
 7);
 8
 9-- Insert from Python
10-- import psycopg2
11-- from openai import OpenAI
12-- 
13-- client = OpenAI()
14-- response = client.embeddings.create(
15--     model="text-embedding-ada-002",
16--     input="Your text here"
17-- )
18-- embedding = response.data[0].embedding
19-- 
20-- cursor.execute(
21--     "INSERT INTO documents (title, content, embedding) VALUES (%s, %s, %s)",
22--     (title, content, embedding)
23-- )
Similarity Search

 1-- Cosine similarity (most common)
 2SELECT 
 3    id,
 4    title,
 5    1 - (embedding <=> '[0.1, 0.2, ...]'::vector) AS similarity
 6FROM documents
 7ORDER BY embedding <=> '[0.1, 0.2, ...]'::vector
 8LIMIT 10;
 9
10-- L2 distance (Euclidean)
11SELECT 
12    id,
13    title,
14    embedding <-> '[0.1, 0.2, ...]'::vector AS distance
15FROM documents
16ORDER BY embedding <-> '[0.1, 0.2, ...]'::vector
17LIMIT 10;
18
19-- Inner product
20SELECT 
21    id,
22    title,
23    (embedding <#> '[0.1, 0.2, ...]'::vector) * -1 AS similarity
24FROM documents
25ORDER BY embedding <#> '[0.1, 0.2, ...]'::vector
26LIMIT 10;
27
28-- With filters
29SELECT 
30    id,
31    title,
32    1 - (embedding <=> '[0.1, 0.2, ...]'::vector) AS similarity
33FROM documents
 34WHERE created_at > '2024-01-01'
 35  AND 1 - (embedding <=> '[0.1, 0.2, ...]'::vector) > 0.8  -- a SELECT alias can't be used in WHERE; repeat the expression
36ORDER BY embedding <=> '[0.1, 0.2, ...]'::vector
37LIMIT 10;

Python Integration

 1import psycopg2
 2from openai import OpenAI
 3import numpy as np
 4
 5# Connect to PostgreSQL
 6conn = psycopg2.connect(
 7    host="localhost",
 8    database="vectordb",
 9    user="vectoruser",
10    password="password"
11)
 12cursor = conn.cursor()
 13# NOTE(review): call pgvector.psycopg2.register_vector(conn) here, otherwise list embeddings are adapted as Postgres ARRAY, not vector
14# Initialize OpenAI
15client = OpenAI(api_key="your-api-key")
16
17def get_embedding(text):
18    """Get embedding from OpenAI"""
19    response = client.embeddings.create(
20        model="text-embedding-ada-002",
21        input=text
22    )
23    return response.data[0].embedding
24
25def insert_document(title, content):
26    """Insert document with embedding"""
27    embedding = get_embedding(content)
28    cursor.execute(
29        """
30        INSERT INTO documents (title, content, embedding)
31        VALUES (%s, %s, %s)
32        RETURNING id
33        """,
34        (title, content, embedding)
35    )
36    doc_id = cursor.fetchone()[0]
37    conn.commit()
38    return doc_id
39
40def search_similar(query, limit=10):
41    """Search for similar documents"""
42    query_embedding = get_embedding(query)
43    cursor.execute(
44        """
45        SELECT 
46            id,
47            title,
48            content,
49            1 - (embedding <=> %s::vector) AS similarity
50        FROM documents
51        ORDER BY embedding <=> %s::vector
52        LIMIT %s
53        """,
54        (query_embedding, query_embedding, limit)
55    )
56    return cursor.fetchall()
57
58# Usage
59doc_id = insert_document(
60    "Introduction to Neural Networks",
61    "Neural networks are computing systems inspired by biological neural networks..."
62)
63
64results = search_similar("What are neural networks?")
65for id, title, content, similarity in results:
66    print(f"{title} (similarity: {similarity:.4f})")

Advanced Features

JSON/JSONB

 1-- Create table with JSONB
 2CREATE TABLE products (
 3    id SERIAL PRIMARY KEY,
 4    name VARCHAR(200),
 5    attributes JSONB
 6);
 7
 8-- Insert JSON data
 9INSERT INTO products (name, attributes) VALUES
10    ('Laptop', '{"brand": "Dell", "ram": "16GB", "storage": "512GB SSD"}'),
11    ('Phone', '{"brand": "Apple", "model": "iPhone 14", "color": "black"}');
12
13-- Query JSON
14SELECT * FROM products WHERE attributes->>'brand' = 'Dell';
15SELECT * FROM products WHERE attributes->'ram' = '"16GB"';
16
17-- JSON operators
18SELECT attributes->'brand' FROM products;  -- Get JSON object
19SELECT attributes->>'brand' FROM products;  -- Get text
20SELECT attributes#>'{specs,ram}' FROM products;  -- Get nested
21
22-- JSON functions
23SELECT jsonb_array_elements(attributes->'tags') FROM products;
24SELECT jsonb_object_keys(attributes) FROM products;
25
26-- Index on JSON
27CREATE INDEX idx_brand ON products ((attributes->>'brand'));
Full-Text Search

 1-- Create table
 2CREATE TABLE articles (
 3    id SERIAL PRIMARY KEY,
 4    title TEXT,
 5    content TEXT,
 6    search_vector tsvector
 7);
 8
 9-- Generate search vector
10UPDATE articles
11SET search_vector = 
12    setweight(to_tsvector('english', COALESCE(title, '')), 'A') ||
13    setweight(to_tsvector('english', COALESCE(content, '')), 'B');
14
15-- Create GIN index
16CREATE INDEX idx_search ON articles USING GIN(search_vector);
17
18-- Search
19SELECT title, ts_rank(search_vector, query) AS rank
20FROM articles, to_tsquery('english', 'postgresql & database') query
21WHERE search_vector @@ query
22ORDER BY rank DESC;
23
24-- Auto-update trigger
25CREATE TRIGGER articles_search_update
26BEFORE INSERT OR UPDATE ON articles
27FOR EACH ROW EXECUTE FUNCTION
28tsvector_update_trigger(search_vector, 'pg_catalog.english', title, content);

Partitioning

 1-- Create partitioned table
 2CREATE TABLE orders (
 3    id SERIAL,
 4    user_id INTEGER,
 5    total DECIMAL(10, 2),
 6    created_at TIMESTAMP
 7) PARTITION BY RANGE (created_at);
 8
 9-- Create partitions
10CREATE TABLE orders_2023 PARTITION OF orders
11FOR VALUES FROM ('2023-01-01') TO ('2024-01-01');
12
13CREATE TABLE orders_2024 PARTITION OF orders
14FOR VALUES FROM ('2024-01-01') TO ('2025-01-01');
15
16-- Insert automatically routes to correct partition
17INSERT INTO orders (user_id, total, created_at)
18VALUES (1, 99.99, '2024-06-15');

Performance Tips

 1-- Analyze query performance
 2EXPLAIN ANALYZE SELECT * FROM users WHERE email = 'test@example.com';
 3
 4-- Create indexes
 5CREATE INDEX idx_email ON users(email);
 6CREATE INDEX idx_created_at ON users(created_at);
 7CREATE INDEX idx_composite ON users(email, created_at);
 8
 9-- Partial index
10CREATE INDEX idx_active_users ON users(email) WHERE active = true;
11
12-- Vacuum and analyze
13VACUUM ANALYZE users;
14
15-- Show slow queries
 16SELECT query, calls, total_exec_time, mean_exec_time  -- PG13+ column names (requires pg_stat_statements extension)
 17FROM pg_stat_statements
 18ORDER BY mean_exec_time DESC
19LIMIT 10;

Related Snippets