10-性能优化技术

性能优化是编写高效程序的关键。优化应基于profiling数据，遵循”测量-优化-验证”循环，避免过早优化。

编译器优化

现代编译器自带强大优化能力。选择合适优化级别和选项可显著提升性能，无需修改代码。

编译选项优化

编译优化分多个级别：-O2 平衡性能和编译时间，-O3 激进优化，-Os 优化体积。-march=native 针对当前CPU优化。

// 编译时优化选项
// -O0: 无优化（调试用）
// -O1: 基本优化
// -O2: 推荐优化级别
// -O3: 激进优化
// -Os: 优化代码大小
// -Ofast: 最快执行速度

// 示例编译命令
// g++ -O2 -march=native -mtune=native -flto source.cpp

// 链接时优化
// g++ -O2 -flto source1.cpp source2.cpp

内联函数

内联函数在调用处展开，消除函数调用开销。适合短小、频繁调用的函数。编译器根据代码复杂度决定是否真正内联。

// 内联函数定义
inline int add(int a, int b) {
    return a + b;
}

// 类内定义自动内联
class Calculator {
public:
    int multiply(int a, int b) {  // 自动内联
        return a * b;
    }
};

// 模板函数通常内联
template<typename T>
T maximum(T a, T b) {
    return (a > b) ? a : b;
}

// 强制内联（不推荐）
inline __attribute__((always_inline)) int fastAdd(int a, int b) {
    return a + b;
}

循环优化

循环是性能热点。循环展开减少分支判断，循环不变式外提避免重复计算，缓存循环边界减少函数调用。

// 循环展开
void unrolledLoop(int* arr, int size) {
    for (int i = 0; i < size; i += 4) {
        arr[i] *= 2;
        arr[i + 1] *= 2;
        arr[i + 2] *= 2;
        arr[i + 3] *= 2;
    }
}

// 循环不变式外提
void optimizedLoop(int* arr, int size, int factor) {
    // factor在循环中不变，编译器会自动外提
    for (int i = 0; i < size; ++i) {
        arr[i] *= factor;
    }
}

// 减少循环中的函数调用
void inefficientLoop(std::vector<int>& vec) {
    for (size_t i = 0; i < vec.size(); ++i) {  // 每次调用size()
        vec[i] *= 2;
    }
}

void efficientLoop(std::vector<int>& vec) {
    size_t size = vec.size();  // 只调用一次
    for (size_t i = 0; i < size; ++i) {
        vec[i] *= 2;
    }
}

内存优化

内存对齐

#include <cstddef>

// 结构体对齐优化
struct UnoptimizedStruct {
    char c;      // 1字节
    int i;       // 4字节，需要3字节填充
    char c2;     // 1字节，需要3字节填充
    double d;    // 8字节
};
// 总大小：24字节

struct OptimizedStruct {
    double d;    // 8字节
    int i;       // 4字节
    char c;      // 1字节
    char c2;     // 1字节
    // 2字节填充
};
// 总大小：16字节

// 强制对齐
struct alignas(16) AlignedStruct {
    double data[2];
};

// 查询对齐要求
void checkAlignment() {
    std::cout << "int alignment: " << alignof(int) << std::endl;
    std::cout << "double alignment: " << alignof(double) << std::endl;
}

缓存友好设计

// 缓存友好的数据结构
class CacheFriendlyMatrix {
private:
    std::vector<double> data;
    size_t rows, cols;
    
public:
    CacheFriendlyMatrix(size_t r, size_t c) 
        : rows(r), cols(c), data(r * c) {}
    
    // 行优先存储，访问连续内存
    double& operator()(size_t row, size_t col) {
        return data[row * cols + col];
    }
    
    const double& operator()(size_t row, size_t col) const {
        return data[row * cols + col];
    }
    
    // 缓存友好的矩阵乘法
    CacheFriendlyMatrix multiply(const CacheFriendlyMatrix& other) const {
        CacheFriendlyMatrix result(rows, other.cols);
        
        // 分块乘法，提高缓存命中率
        const size_t blockSize = 64;
        
        for (size_t i = 0; i < rows; i += blockSize) {
            for (size_t j = 0; j < other.cols; j += blockSize) {
                for (size_t k = 0; k < cols; k += blockSize) {
                    // 处理块
                    for (size_t ii = i; ii < std::min(i + blockSize, rows); ++ii) {
                        for (size_t jj = j; jj < std::min(j + blockSize, other.cols); ++jj) {
                            double sum = 0.0;
                            for (size_t kk = k; kk < std::min(k + blockSize, cols); ++kk) {
                                sum += (*this)(ii, kk) * other(kk, jj);
                            }
                            result(ii, jj) += sum;
                        }
                    }
                }
            }
        }
        
        return result;
    }
};

内存池

#include <memory>
#include <vector>

template<typename T>
class MemoryPool {
private:
    struct Block {
        alignas(T) char data[sizeof(T)];
        bool inUse;
    };
    
    std::vector<Block> blocks;
    size_t nextFree;
    
public:
    MemoryPool(size_t initialSize = 1024) : nextFree(0) {
        blocks.resize(initialSize);
    }
    
    T* allocate() {
        // 查找空闲块
        for (size_t i = nextFree; i < blocks.size(); ++i) {
            if (!blocks[i].inUse) {
                blocks[i].inUse = true;
                nextFree = i + 1;
                return reinterpret_cast<T*>(blocks[i].data);
            }
        }
        
        // 扩展池
        size_t oldSize = blocks.size();
        blocks.resize(oldSize * 2);
        
        blocks[oldSize].inUse = true;
        nextFree = oldSize + 1;
        return reinterpret_cast<T*>(blocks[oldSize].data);
    }
    
    void deallocate(T* ptr) {
        for (auto& block : blocks) {
            if (reinterpret_cast<T*>(block.data) == ptr) {
                block.inUse = false;
                if (&block - &blocks[0] < nextFree) {
                    nextFree = &block - &blocks[0];
                }
                break;
            }
        }
    }
};

算法优化

时间复杂度优化

// 原始O(n²)算法
int findMaxSumSlow(const std::vector<int>& arr) {
    int maxSum = INT_MIN;
    for (size_t i = 0; i < arr.size(); ++i) {
        for (size_t j = i; j < arr.size(); ++j) {
            int sum = 0;
            for (size_t k = i; k <= j; ++k) {
                sum += arr[k];
            }
            maxSum = std::max(maxSum, sum);
        }
    }
    return maxSum;
}

// 优化为O(n)算法（Kadane算法）
int findMaxSumFast(const std::vector<int>& arr) {
    int maxSoFar = arr[0];
    int maxEndingHere = arr[0];
    
    for (size_t i = 1; i < arr.size(); ++i) {
        maxEndingHere = std::max(arr[i], maxEndingHere + arr[i]);
        maxSoFar = std::max(maxSoFar, maxEndingHere);
    }
    
    return maxSoFar;
}

空间复杂度优化

// 原始算法：O(n)空间
int fibonacciSlow(int n) {
    if (n <= 1) return n;
    
    std::vector<int> dp(n + 1);
    dp[0] = 0;
    dp[1] = 1;
    
    for (int i = 2; i <= n; ++i) {
        dp[i] = dp[i - 1] + dp[i - 2];
    }
    
    return dp[n];
}

// 优化算法：O(1)空间
int fibonacciFast(int n) {
    if (n <= 1) return n;
    
    int prev2 = 0, prev1 = 1;
    for (int i = 2; i <= n; ++i) {
        int current = prev1 + prev2;
        prev2 = prev1;
        prev1 = current;
    }
    
    return prev1;
}

数据结构优化

高效容器选择

#include <unordered_map>
#include <unordered_set>
#include <deque>

// 根据使用场景选择合适的数据结构
class OptimizedDataStructures {
public:
    // 频繁查找：使用unordered_map
    void frequentLookup() {
        std::unordered_map<int, std::string> map;
        // O(1)平均查找时间
    }
    
    // 有序数据：使用map
    void orderedData() {
        std::map<int, std::string> map;
        // O(log n)查找时间，但保持有序
    }
    
    // 频繁插入删除：使用list
    void frequentInsertDelete() {
        std::list<int> list;
        // O(1)插入删除时间
    }
    
    // 双端操作：使用deque
    void doubleEndedOperations() {
        std::deque<int> deque;
        // O(1)双端插入删除
    }
};

自定义分配器

#include <memory>

template<typename T>
class FastAllocator {
private:
    static constexpr size_t BLOCK_SIZE = 1024;
    struct Block {
        alignas(T) char data[BLOCK_SIZE * sizeof(T)];
        size_t used;
    };
    
    std::vector<std::unique_ptr<Block>> blocks;
    
public:
    using value_type = T;
    
    T* allocate(size_t n) {
        if (n > BLOCK_SIZE) {
            return static_cast<T*>(::operator new(n * sizeof(T)));
        }
        
        // 查找可用块
        for (auto& block : blocks) {
            if (block->used + n <= BLOCK_SIZE) {
                T* ptr = reinterpret_cast<T*>(block->data + block->used * sizeof(T));
                block->used += n;
                return ptr;
            }
        }
        
        // 创建新块
        auto newBlock = std::make_unique<Block>();
        newBlock->used = n;
        T* ptr = reinterpret_cast<T*>(newBlock->data);
        blocks.push_back(std::move(newBlock));
        return ptr;
    }
    
    void deallocate(T* ptr, size_t n) {
        // 简单实现：不实际释放
        // 实际应用中需要更复杂的逻辑
    }
};

并行优化

OpenMP并行

#include <omp.h>

class ParallelOptimization {
public:
    // 并行循环
    void parallelLoop(std::vector<int>& data) {
        #pragma omp parallel for
        for (size_t i = 0; i < data.size(); ++i) {
            data[i] *= 2;
        }
    }
    
    // 并行归约
    int parallelSum(const std::vector<int>& data) {
        int sum = 0;
        #pragma omp parallel for reduction(+:sum)
        for (size_t i = 0; i < data.size(); ++i) {
            sum += data[i];
        }
        return sum;
    }
    
    // 并行矩阵乘法
    void parallelMatrixMultiply(const std::vector<std::vector<double>>& A,
                               const std::vector<std::vector<double>>& B,
                               std::vector<std::vector<double>>& C) {
        size_t n = A.size();
        
        #pragma omp parallel for
        for (size_t i = 0; i < n; ++i) {
            for (size_t j = 0; j < n; ++j) {
                C[i][j] = 0.0;
                for (size_t k = 0; k < n; ++k) {
                    C[i][j] += A[i][k] * B[k][j];
                }
            }
        }
    }
};

SIMD优化

#include <immintrin.h>

class SIMDOptimization {
public:
    // 使用AVX2进行向量化
    void vectorizedAdd(const float* a, const float* b, float* result, size_t n) {
        size_t i = 0;
        
        // 处理8个float的向量
        for (; i + 8 <= n; i += 8) {
            __m256 va = _mm256_load_ps(&a[i]);
            __m256 vb = _mm256_load_ps(&b[i]);
            __m256 vresult = _mm256_add_ps(va, vb);
            _mm256_store_ps(&result[i], vresult);
        }
        
        // 处理剩余元素
        for (; i < n; ++i) {
            result[i] = a[i] + b[i];
        }
    }
    
    // 使用SSE进行向量化
    void sseVectorizedMultiply(const float* a, const float* b, float* result, size_t n) {
        size_t i = 0;
        
        // 处理4个float的向量
        for (; i + 4 <= n; i += 4) {
            __m128 va = _mm_load_ps(&a[i]);
            __m128 vb = _mm_load_ps(&b[i]);
            __m128 vresult = _mm_mul_ps(va, vb);
            _mm_store_ps(&result[i], vresult);
        }
        
        // 处理剩余元素
        for (; i < n; ++i) {
            result[i] = a[i] * b[i];
        }
    }
};

性能分析工具

基准测试

#include <chrono>
#include <iostream>

class Benchmark {
public:
    template<typename Func>
    static double measureTime(Func&& func) {
        auto start = std::chrono::high_resolution_clock::now();
        func();
        auto end = std::chrono::high_resolution_clock::now();
        
        auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
        return duration.count() / 1000.0;  // 返回毫秒
    }
    
    template<typename Func>
    static void comparePerformance(const std::string& name1, Func&& func1,
                                  const std::string& name2, Func&& func2,
                                  int iterations = 1000) {
        double time1 = 0, time2 = 0;
        
        for (int i = 0; i < iterations; ++i) {
            time1 += measureTime(func1);
            time2 += measureTime(func2);
        }
        
        time1 /= iterations;
        time2 /= iterations;
        
        std::cout << name1 << ": " << time1 << " ms" << std::endl;
        std::cout << name2 << ": " << time2 << " ms" << std::endl;
        std::cout << "Speedup: " << time1 / time2 << "x" << std::endl;
    }
};

性能计数器

#include <chrono>
#include <map>
#include <string>

class PerformanceCounter {
private:
    std::map<std::string, double> counters;
    std::map<std::string, std::chrono::high_resolution_clock::time_point> timers;
    
public:
    void startTimer(const std::string& name) {
        timers[name] = std::chrono::high_resolution_clock::now();
    }
    
    void endTimer(const std::string& name) {
        auto end = std::chrono::high_resolution_clock::now();
        auto duration = std::chrono::duration_cast<std::chrono::microseconds>(
            end - timers[name]).count();
        counters[name] = duration / 1000.0;  // 毫秒
    }
    
    void addCounter(const std::string& name, double value) {
        counters[name] += value;
    }
    
    void printReport() {
        std::cout << "Performance Report:" << std::endl;
        for (const auto& pair : counters) {
            std::cout << pair.first << ": " << pair.second << " ms" << std::endl;
        }
    }
};

CPU缓存优化

缓存局部性原理

现代CPU有多级缓存（L1/L2/L3），访问延迟差异巨大：

L1缓存：~1ns
L2缓存：~3ns
L3缓存：~10ns
主内存：~100ns

优化目标：提高缓存命中率

空间局部性优化

// ❌ 缓存不友好：结构体数组（AoS）
struct Particle {
    float x, y, z;     // 位置
    float vx, vy, vz;  // 速度
    float r, g, b;     // 颜色
};

Particle particles[10000];

void updatePosition() {
    for (int i = 0; i < 10000; ++i) {
        particles[i].x += particles[i].vx;  // 加载整个struct，浪费缓存
        particles[i].y += particles[i].vy;
        particles[i].z += particles[i].vz;
    }
}

// ✅ 缓存友好：数组结构体（SoA）
struct Particles {
    float x[10000], y[10000], z[10000];      // 位置
    float vx[10000], vy[10000], vz[10000];   // 速度
    float r[10000], g[10000], b[10000];      // 颜色
};

Particles particles;

void updatePosition() {
    for (int i = 0; i < 10000; ++i) {
        particles.x[i] += particles.vx[i];  // 连续访问，缓存友好
        particles.y[i] += particles.vy[i];
        particles.z[i] += particles.vz[i];
    }
}

时间局部性优化

// ❌ 缓存不友好：列优先访问
int matrix[1000][1000];

void sumColumns() {
    for (int col = 0; col < 1000; ++col) {
        for (int row = 0; row < 1000; ++row) {
            sum += matrix[row][col];  // 跳跃访问，缓存miss
        }
    }
}

// ✅ 缓存友好：行优先访问
void sumRows() {
    for (int row = 0; row < 1000; ++row) {
        for (int col = 0; col < 1000; ++col) {
            sum += matrix[row][col];  // 连续访问，缓存命中
        }
    }
}

// 性能差异：可达10倍以上！

数据对齐优化

// ❌ 未对齐：跨缓存行
struct Unaligned {
    char c;      // 1字节
    int i;       // 4字节，可能跨缓存行
    double d;    // 8字节
};  // 可能24字节（填充）

// ✅ 对齐优化
struct Aligned {
    double d;    // 8字节（对齐到8）
    int i;       // 4字节
    char c;      // 1字节
};  // 16字节

// ✅ 缓存行对齐（避免伪共享）
struct alignas(64) CacheLine {  // 64字节对齐（典型缓存行大小）
    std::atomic<int> counter;
    char padding[60];  // 填充到64字节
};

// 多线程场景：不同线程的数据分布在不同缓存行
CacheLine counters[8];  // 8个线程，避免伪共享

分支预测优化

分支预测原理

现代CPU使用分支预测器猜测分支走向，预测错误导致流水线清空（~10-20周期损失）。

// ❌ 难预测的分支
void process(std::vector<int>& data) {
    for (int x : data) {
        if (x % 2 == 0) {  // 50/50概率，难预测
            // ...
        } else {
            // ...
        }
    }
}

// ✅ 优化1：排序后分支可预测
void process(std::vector<int>& data) {
    std::sort(data.begin(), data.end());  // 先排序
    
    for (int x : data) {
        if (x < threshold) {  // 连续true或false，易预测
            // ...
        } else {
            // ...
        }
    }
}

// ✅ 优化2：branchless编程
int max(int a, int b) {
    // 传统：if (a > b) return a; else return b;
    return a > b ? a : b;  // 可能编译为branchless代码
}

int abs(int x) {
    // 传统：if (x < 0) return -x; else return x;
    int mask = x >> 31;  // branchless
    return (x + mask) ^ mask;
}

// ✅ 优化3：likely/unlikely提示（GCC）
void process(int x) {
    if (__builtin_expect(x > 0, 1)) {  // 提示通常为真
        // 热路径
    } else {
        // 冷路径
    }
}

性能对比

// 测试：100万随机数求和（偶数乘2）
// 未排序：~15ms（分支预测失败50%）
// 已排序：~5ms （分支预测成功100%）
// Branchless：~3ms（无分支）

优化建议：

避免随机分支：预处理数据，使分支可预测
热路径优先：常执行的分支放在if而非else
消除短循环分支：循环展开或branchless
Profile指导：用perf查看分支预测失败率