Scala code to generate books data

2025年3月12日 作者 unix2go

The following Scala program generates a CSV file containing 100,000 randomly generated book records. You can use the file as sample data while learning big data stacks such as Spark and Hive.

import scala.util.Random
import java.io.{File, PrintWriter}
import java.text.DecimalFormat

/** Generates `computer_books.csv`, a CSV file of randomly assembled computer-book records.
  *
  * The file starts with the header
  * `title,publisher,year,author,country,price,category,edition`
  * followed by 100,000 data rows. Title, publisher, author and country are wrapped in
  * double quotes; year, price, category and edition are written bare.
  *
  * Output is always UTF-8 and prices always use `.` as the decimal separator, regardless
  * of the JVM's default charset or locale.
  */
object GenerateBookCsvData {
  // Imported locally so this object does not depend on edits to the file-level imports.
  import java.text.DecimalFormatSymbols
  import java.util.Locale

  private val OutputFileName = "computer_books.csv"
  private val HeaderLine = "title,publisher,year,author,country,price,category,edition"
  private val TotalRows = 100000
  private val ProgressInterval = 10000

  // Value pools. Vectors are used because every row indexes into them at random positions.
  private val titleFirstParts = Vector(
    "Programming with", "Mastering", "Learning", "Practical", "Advanced",
    "The Art of", "Dive into", "Essential", "Professional", "The Complete Guide to",
    "Understanding", "Exploring", "Fundamentals of", "Hands-On", "Beginning",
    "Expert", "Modern", "Effective", "Introduction to", "Ultimate"
  )

  private val titleSubjects = Vector(
    "Scala", "Python", "Java", "C++", "JavaScript", "Ruby", "Go",
    "Rust", "Swift", "Kotlin", "TypeScript", "PHP", "Perl", "R",
    "C#", "SQL", "HTML5", "CSS3", "React", "Angular", "Vue.js",
    "Node.js", "Docker", "Kubernetes", "Apache Spark", "Hadoop",
    "MongoDB", "PostgreSQL", "Redis", "ElasticSearch", "Machine Learning",
    "Deep Learning", "TensorFlow", "PyTorch", "Data Science", "Artificial Intelligence",
    "Web Development", "Mobile Development", "Cloud Computing", "DevOps",
    "Microservices", "RESTful APIs", "GraphQL", "Blockchain", "Cybersecurity"
  )

  private val publishers = Vector(
    "O'Reilly", "Apress", "Packt Publishing", "Manning Publications", "Wiley",
    "Addison-Wesley", "No Starch Press", "Pragmatic Bookshelf", "Springer",
    "MIT Press", "Cambridge University Press", "CRC Press", "McGraw-Hill",
    "Morgan Kaufmann", "Wrox Press", "Peachpit Press", "Sams Publishing",
    "Microsoft Press", "Academic Press", "Pearson Education"
  )

  private val firstNames = Vector(
    "John", "Jane", "Michael", "Sarah", "David", "Lisa", "Robert", "Emily",
    "James", "Jennifer", "William", "Elizabeth", "Richard", "Jessica", "Thomas",
    "Susan", "Joseph", "Laura", "Charles", "Karen", "Daniel", "Nancy", "Matthew",
    "Linda", "Anthony", "Patricia", "Mark", "Barbara", "Donald", "Maria", "Steven",
    "Catherine", "Paul", "Margaret", "Andrew", "Sandra", "Joshua", "Ashley", "Brian",
    "Michelle", "Kevin", "Amanda", "George", "Melissa", "Edward", "Stephanie", "Anna"
  )

  private val lastNames = Vector(
    "Smith", "Johnson", "Williams", "Jones", "Brown", "Davis", "Miller", "Wilson",
    "Moore", "Taylor", "Anderson", "Thomas", "Jackson", "White", "Harris", "Martin",
    "Thompson", "Garcia", "Martinez", "Robinson", "Clark", "Rodriguez", "Lewis", "Lee",
    "Walker", "Hall", "Allen", "Young", "Hernandez", "King", "Wright", "Lopez",
    "Hill", "Scott", "Green", "Adams", "Baker", "Nelson", "Carter", "Mitchell",
    "Perez", "Roberts", "Turner", "Phillips", "Campbell", "Parker", "Evans", "Edwards"
  )

  private val countries = Vector(
    "USA", "UK", "Canada", "Germany", "France", "Australia", "Japan", "India",
    "China", "Brazil", "Italy", "Netherlands", "Sweden", "Switzerland", "Spain",
    "Russia", "South Korea", "Singapore", "Israel", "Denmark", "Finland", "Norway",
    "Belgium", "Austria", "Ireland", "New Zealand", "Portugal", "Poland"
  )

  private val categories = Vector(
    "programming", "database", "cloud computing", "big data", "machine learning",
    "networking", "engineering", "distributed systems", "security", "algorithm"
  )

  // Edition labels, ordered from lowest to highest.
  private val editions = Vector(
    "First", "Second", "Third", "Fourth", "Fifth",
    "Sixth", "Seventh", "Eighth", "Ninth", "Tenth"
  )

  /** Writes the header and all data rows to `computer_books.csv` in the working directory,
    * printing a progress line to stdout every 10,000 rows.
    *
    * @param args command-line arguments; ignored
    * @throws java.io.IOException if the underlying writer reported an I/O error
    */
  def main(args: Array[String]): Unit = {
    // Pin the charset so the file does not depend on the platform default encoding.
    val writer = new PrintWriter(new File(OutputFileName), "UTF-8")
    try {
      writer.println(HeaderLine)

      val random = new Random()
      // Locale.ROOT symbols keep '.' as the decimal separator; with the default locale a
      // comma-decimal environment would emit "12,34" and add a spurious CSV column.
      val priceFormat = new DecimalFormat("0.00", DecimalFormatSymbols.getInstance(Locale.ROOT))

      for (i <- 1 to TotalRows) {
        writer.println(randomRow(random, priceFormat))
        if (i % ProgressInterval == 0) {
          println(s"Generated $i rows")
        }
      }

      // PrintWriter never throws on write failures; checkError() flushes and exposes them.
      if (writer.checkError()) {
        throw new java.io.IOException(s"Failed while writing $OutputFileName")
      }
    } finally {
      // Release the file handle even when row generation or writing fails.
      writer.close()
    }
    println(s"Generated $TotalRows rows of data in $OutputFileName")
  }

  /** Returns a uniformly chosen element of a non-empty `pool`. */
  private def pick(pool: Vector[String], random: Random): String =
    pool(random.nextInt(pool.length))

  /** Wraps `field` in double quotes. Embedded quotes are not escaped; none of the value
    * pools contain one.
    */
  private def quoted(field: String): String = "\"" + field + "\""

  /** Builds one CSV data row (without line terminator) from freshly drawn random values. */
  private def randomRow(random: Random, priceFormat: DecimalFormat): String = {
    val title = s"${pick(titleFirstParts, random)} ${pick(titleSubjects, random)}"
    val publisher = pick(publishers, random)
    val year = 2000 + random.nextInt(24) // 2000 through 2023
    val author = s"${pick(firstNames, random)} ${pick(lastNames, random)}"
    val country = pick(countries, random)
    // Drawn from 9.99 (inclusive) up to just under 99.99, then rendered with two decimals.
    val price = priceFormat.format(9.99 + random.nextDouble() * 90.0)
    val category = pick(categories, random)

    // Older books may carry higher editions: the highest allowed edition index grows by
    // one for every three years of age, and the actual index is uniform in 0..max.
    // NOTE: with years 2000-2023 the age is at most 24, so the index never exceeds 8 and
    // "Tenth" is never selected; kept as in the original generator.
    val bookAge = 2024 - year
    val maxEditionIndex = math.min(editions.length - 1, math.max(0, bookAge / 3))
    val edition = editions(random.nextInt(maxEditionIndex + 1))

    // Free-text fields are quoted; fields are joined without s"\"..." escapes, which
    // older Scala 2 compilers did not accept inside interpolated strings.
    Seq(
      quoted(title), quoted(publisher), year.toString, quoted(author),
      quoted(country), price, category, edition
    ).mkString(",")
  }
}