Scala code to generate book data
March 12, 2025 — The following code generates a CSV file containing 100,000 randomly generated book records. You can use the file as sample data when learning big-data stacks such as Spark and Hive.
import scala.util.Random
import java.io.{File, PrintWriter}
import java.text.DecimalFormat
/**
 * Generates a CSV file of randomly assembled computer-book records, intended as
 * sample data for tools such as Spark and Hive.
 *
 * Columns: `title,publisher,year,author,country,price,category,edition`.
 *
 * Usage: `GenerateBookCsvData [outputPath] [rowCount]`
 *  - `outputPath` defaults to `computer_books.csv`
 *  - `rowCount` defaults to 100000 and must be a non-negative integer
 */
object GenerateBookCsvData {
  // Imported locally so this object stays self-contained.
  import java.io.IOException
  import java.nio.charset.StandardCharsets
  import java.text.DecimalFormatSymbols
  import java.util.Locale

  private val DefaultOutputPath = "computer_books.csv"
  private val DefaultRowCount = 100000
  private val Header = "title,publisher,year,author,country,price,category,edition"

  private val FirstYear = 2000        // earliest publication year
  private val YearSpan = 24           // years are drawn from 2000 to 2023 inclusive
  private val ReferenceYear = 2024    // "current" year used to compute a book's age
  private val YearsPerEdition = 3     // every 3 years of age unlocks one more possible edition
  private val ProgressInterval = 10000

  // Candidate values. Vectors are used because every lookup is by random index.
  private val titleFirstParts = Vector(
    "Programming with", "Mastering", "Learning", "Practical", "Advanced",
    "The Art of", "Dive into", "Essential", "Professional", "The Complete Guide to",
    "Understanding", "Exploring", "Fundamentals of", "Hands-On", "Beginning",
    "Expert", "Modern", "Effective", "Introduction to", "Ultimate"
  )

  private val titleSubjects = Vector(
    "Scala", "Python", "Java", "C++", "JavaScript", "Ruby", "Go",
    "Rust", "Swift", "Kotlin", "TypeScript", "PHP", "Perl", "R",
    "C#", "SQL", "HTML5", "CSS3", "React", "Angular", "Vue.js",
    "Node.js", "Docker", "Kubernetes", "Apache Spark", "Hadoop",
    "MongoDB", "PostgreSQL", "Redis", "ElasticSearch", "Machine Learning",
    "Deep Learning", "TensorFlow", "PyTorch", "Data Science", "Artificial Intelligence",
    "Web Development", "Mobile Development", "Cloud Computing", "DevOps",
    "Microservices", "RESTful APIs", "GraphQL", "Blockchain", "Cybersecurity"
  )

  private val publishers = Vector(
    "O'Reilly", "Apress", "Packt Publishing", "Manning Publications", "Wiley",
    "Addison-Wesley", "No Starch Press", "Pragmatic Bookshelf", "Springer",
    "MIT Press", "Cambridge University Press", "CRC Press", "McGraw-Hill",
    "Morgan Kaufmann", "Wrox Press", "Peachpit Press", "Sams Publishing",
    "Microsoft Press", "Academic Press", "Pearson Education"
  )

  private val firstNames = Vector(
    "John", "Jane", "Michael", "Sarah", "David", "Lisa", "Robert", "Emily",
    "James", "Jennifer", "William", "Elizabeth", "Richard", "Jessica", "Thomas",
    "Susan", "Joseph", "Laura", "Charles", "Karen", "Daniel", "Nancy", "Matthew",
    "Linda", "Anthony", "Patricia", "Mark", "Barbara", "Donald", "Maria", "Steven",
    "Catherine", "Paul", "Margaret", "Andrew", "Sandra", "Joshua", "Ashley", "Brian",
    "Michelle", "Kevin", "Amanda", "George", "Melissa", "Edward", "Stephanie", "Anna"
  )

  private val lastNames = Vector(
    "Smith", "Johnson", "Williams", "Jones", "Brown", "Davis", "Miller", "Wilson",
    "Moore", "Taylor", "Anderson", "Thomas", "Jackson", "White", "Harris", "Martin",
    "Thompson", "Garcia", "Martinez", "Robinson", "Clark", "Rodriguez", "Lewis", "Lee",
    "Walker", "Hall", "Allen", "Young", "Hernandez", "King", "Wright", "Lopez",
    "Hill", "Scott", "Green", "Adams", "Baker", "Nelson", "Carter", "Mitchell",
    "Perez", "Roberts", "Turner", "Phillips", "Campbell", "Parker", "Evans", "Edwards"
  )

  private val countries = Vector(
    "USA", "UK", "Canada", "Germany", "France", "Australia", "Japan", "India",
    "China", "Brazil", "Italy", "Netherlands", "Sweden", "Switzerland", "Spain",
    "Russia", "South Korea", "Singapore", "Israel", "Denmark", "Finland", "Norway",
    "Belgium", "Austria", "Ireland", "New Zealand", "Portugal", "Poland"
  )

  private val categories = Vector(
    "programming", "database", "cloud computing", "big data", "machine learning",
    "networking", "engineering", "distributed systems", "security", "algorithm"
  )

  // Edition labels, ordered from lowest to highest.
  private val editions = Vector(
    "First", "Second", "Third", "Fourth", "Fifth",
    "Sixth", "Seventh", "Eighth", "Ninth", "Tenth"
  )

  /**
   * Entry point: writes the header plus `rowCount` random rows to `outputPath`.
   *
   * @param args optional `[outputPath, rowCount]`; missing values fall back to the defaults
   * @throws IllegalArgumentException if `rowCount` is not a non-negative integer
   * @throws java.io.IOException      if the writer reports an I/O error
   */
  def main(args: Array[String]): Unit = {
    val outputPath = args.headOption.getOrElse(DefaultOutputPath)
    val totalRows = args.lift(1).map(parseRowCount).getOrElse(DefaultRowCount)

    val random = new Random()
    // Locale.ROOT pins '.' as the decimal separator; with the default locale a
    // comma-decimal locale (e.g. de-DE) would add a stray comma to every row.
    val priceFormat = new DecimalFormat("0.00", DecimalFormatSymbols.getInstance(Locale.ROOT))

    // Explicit UTF-8 so the output does not depend on the platform default charset.
    val writer = new PrintWriter(new File(outputPath), StandardCharsets.UTF_8.name())
    try {
      writer.println(Header)
      for (i <- 1 to totalRows) {
        writer.println(randomRow(random, priceFormat))
        // Report progress every 10000 rows.
        if (i % ProgressInterval == 0) {
          println(s"Generated $i rows")
        }
      }
      // PrintWriter never throws on write failures; surface them explicitly.
      if (writer.checkError()) {
        throw new IOException(s"I/O error while writing $outputPath")
      }
    } finally {
      // Always release the file handle, even if generation fails part-way.
      writer.close()
    }
    println(s"Generated $totalRows rows of data in $outputPath")
  }

  /** Parses the row-count argument, rejecting non-numeric or negative values. */
  private def parseRowCount(raw: String): Int = {
    val parsed =
      try raw.trim.toInt
      catch {
        case _: NumberFormatException =>
          throw new IllegalArgumentException(s"Row count must be an integer, got: '$raw'")
      }
    require(parsed >= 0, s"Row count must be non-negative, got: $parsed")
    parsed
  }

  /** Returns a uniformly random element of a non-empty sequence. */
  private def pick[A](random: Random, options: IndexedSeq[A]): A =
    options(random.nextInt(options.length))

  /** Wraps a value in double quotes, doubling any embedded quotes. */
  private def quoted(value: String): String =
    "\"" + value.replace("\"", "\"\"") + "\""

  /** Builds one CSV data row (without line terminator) from random field values. */
  private def randomRow(random: Random, priceFormat: DecimalFormat): String = {
    val title = s"${pick(random, titleFirstParts)} ${pick(random, titleSubjects)}"
    val publisher = pick(random, publishers)
    val year = FirstYear + random.nextInt(YearSpan) // 2000 to 2023
    val author = s"${pick(random, firstNames)} ${pick(random, lastNames)}"
    val country = pick(random, countries)
    val price = priceFormat.format(9.99 + random.nextDouble() * 90.0) // 9.99 to 99.99
    val category = pick(random, categories)

    // Older books may carry higher editions: the highest reachable edition index
    // grows by one per YearsPerEdition years of age, capped at the last label.
    // The age is always >= 1 because year <= 2023, so no lower clamp is needed.
    val bookAge = ReferenceYear - year
    val maxEditionIndex = math.min(editions.length - 1, bookAge / YearsPerEdition)
    val edition = editions(random.nextInt(maxEditionIndex + 1))

    // Free-text fields are quoted so that commas inside them cannot split the row.
    Seq(
      quoted(title), quoted(publisher), year.toString, quoted(author),
      quoted(country), price, category, edition
    ).mkString(",")
  }
}