{"id":"6f414ee3-a0a2-43d3-8795-8c2fb746eb1b","shortId":"d6jUP4","kind":"skill","title":"Apache Spark DataFrame ETL Pipeline","tagline":"Automates PySpark DataFrame transformations including schema inference, partition pruning, and Delta Lake merge operations. Integrates with AWS Glue Data Catalog and Apache Iceberg table formats for lakehouse architectures.","description":"# Apache Spark DataFrame ETL Pipeline\n\nAutomates PySpark DataFrame transformations including schema inference, partition pruning, and Delta Lake merge operations. Integrates with AWS Glue Data Catalog and Apache Iceberg table formats for lakehouse architectures.\n\n## Installation\n\nRequirements and caveats from upstream:\n- high-level APIs in Scala, Java, Python, and R (Deprecated), and an optimized engine that\n- ## Interactive Python Shell\n- Alternatively, if you prefer Python, you can use the Python shell:\n\nBasic usage or getting-started notes:\n- To build Spark and its example programs, run:\n- And run the following command, which should also return 1,000,000,000:\n- ## Example Programs\n\n- Source: https://github.com/apache/spark\n- Extracted from upstream docs: https://raw.githubusercontent.com/apache/spark/HEAD/README.md\n\n## Source\n\n- [Agent Skill Exchange](https://agentskillexchange.com/skills/spark-dataframe-etl-pipeline/)","tags":["spark","dataframe","etl","pipeline","skills","agentskillexchange","agent-skills","ai-agents","ai-tools","awesome-list","claude-code","codex"],"capabilities":["skill","source-agentskillexchange","skill-spark-dataframe-etl-pipeline","topic-agent-skills","topic-ai-agents","topic-ai-tools","topic-awesome-list","topic-claude-code","topic-codex","topic-cursor","topic-llm","topic-mcp","topic-npx-skills","topic-openclaw","topic-skills-catalog"],"categories":["skills"],"synonyms":[],"warnings":[],"endpointUrl":"https://skills.sh/agentskillexchange/skills/spark-dataframe-etl-pipeline","protocol":"skill","transport":"skills-sh","auth":{"type":"none","details":{"cli":"npx skills add agentskillexchange/skills","source_repo":"https://github.com/agentskillexchange/skills","install_from":"skills.sh"}},"qualityScore":"0.454","qualityRationale":"deterministic score 0.45 from registry signals: · indexed on github topic:agent-skills · 8 github stars · SKILL.md body (935 chars)","verified":false,"liveness":"unknown","lastLivenessCheck":null,"agentReviews":{"count":0,"score_avg":null,"cost_usd_avg":null,"success_rate":null,"latency_p50_ms":null,"narrative_summary":null,"summary_updated_at":null},"enrichmentModel":"deterministic:skill-github:v1","enrichmentVersion":1,"enrichedAt":"2026-05-18T19:12:35.053Z","embedding":null,"createdAt":"2026-05-18T13:19:34.353Z","updatedAt":"2026-05-18T19:12:35.053Z","lastSeenAt":"2026-05-18T19:12:35.053Z","tsv":"'/apache/spark':136 '/apache/spark/head/readme.md':143 '/skills/spark-dataframe-etl-pipeline/)':150 '000':128,129,130 '1':127 'agent':145 'agentskillexchange.com':149 'agentskillexchange.com/skills/spark-dataframe-etl-pipeline/)':148 'also':125 'altern':92 'apach':1,27,34,60 'api':76 'architectur':33,66 'autom':6,39 'aw':22,55 'basic':103 'build':111 'catalog':25,58 'caveat':70 'command':122 'data':24,57 'datafram':3,8,36,41 'delta':16,49 'deprec':83 'doc':140 'engin':87 'etl':4,37 'exampl':115,131 'exchang':147 'extract':137 'follow':121 'format':30,63 'get':107 'getting-start':106 'github.com':135 'github.com/apache/spark':134 'glue':23,56 'high':74 'high-level':73 'iceberg':28,61 'includ':10,43 'infer':12,45 'instal':67 'integr':20,53 'interact':89 'java':79 'lake':17,50 'lakehous':32,65 'level':75 'merg':18,51 'note':109 'oper':19,52 'optim':86 'partit':13,46 'pipelin':5,38 'prefer':95 'program':116,132 'prune':14,47 'pyspark':7,40 'python':80,90,96,101 'r':82 'raw.githubusercontent.com':142 'raw.githubusercontent.com/apache/spark/head/readme.md':141 'requir':68 'return':126 'run':117,119 'scala':78 'schema':11,44 'shell':91,102 'skill':146 'skill-spark-dataframe-etl-pipeline' 'sourc':133,144 'source-agentskillexchange' 'spark':2,35,112 'start':108 'tabl':29,62 'topic-agent-skills' 'topic-ai-agents' 'topic-ai-tools' 'topic-awesome-list' 'topic-claude-code' 'topic-codex' 'topic-cursor' 'topic-llm' 'topic-mcp' 'topic-npx-skills' 'topic-openclaw' 'topic-skills-catalog' 'transform':9,42 'upstream':72,139 'usag':104 'use':99","prices":[{"id":"b66da741-8236-426c-bafb-464dddebe639","listingId":"6f414ee3-a0a2-43d3-8795-8c2fb746eb1b","amountUsd":"0","unit":"free","nativeCurrency":null,"nativeAmount":null,"chain":null,"payTo":null,"paymentMethod":"skill-free","isPrimary":true,"details":{"org":"agentskillexchange","category":"skills","install_from":"skills.sh"},"createdAt":"2026-05-18T13:19:34.353Z"}],"sources":[{"listingId":"6f414ee3-a0a2-43d3-8795-8c2fb746eb1b","source":"github","sourceId":"agentskillexchange/skills/spark-dataframe-etl-pipeline","sourceUrl":"https://github.com/agentskillexchange/skills/tree/main/skills/spark-dataframe-etl-pipeline","isPrimary":false,"firstSeenAt":"2026-05-18T13:19:34.353Z","lastSeenAt":"2026-05-18T19:12:35.053Z"}],"details":{"listingId":"6f414ee3-a0a2-43d3-8795-8c2fb746eb1b","quickStartSnippet":null,"exampleRequest":null,"exampleResponse":null,"schema":null,"openapiUrl":null,"agentsTxtUrl":null,"citations":[],"useCases":[],"bestFor":[],"notFor":[],"kindDetails":{"org":"agentskillexchange","slug":"spark-dataframe-etl-pipeline","github":{"repo":"agentskillexchange/skills","stars":8,"topics":["agent-skills","ai-agents","ai-tools","awesome-list","claude-code","codex","cursor","llm","mcp","npx-skills","openclaw","skills-catalog"],"license":"mit","html_url":"https://github.com/agentskillexchange/skills","pushed_at":"2026-05-18T19:02:17Z","description":"The open catalog of AI agent skills — 2,000+ security-scanned skills for Claude Code, Cursor, Codex, and more.","skill_md_sha":"7b4aa2c8a3caf6bfc4fded00cbd897ad7ea68143","skill_md_path":"skills/spark-dataframe-etl-pipeline/SKILL.md","default_branch":"main","skill_tree_url":"https://github.com/agentskillexchange/skills/tree/main/skills/spark-dataframe-etl-pipeline"},"layout":"multi","source":"github","category":"skills","frontmatter":{"name":"Apache Spark DataFrame ETL Pipeline","description":"Automates PySpark DataFrame transformations including schema inference, partition pruning, and Delta Lake merge operations. Integrates with AWS Glue Data Catalog and Apache Iceberg table formats for lakehouse architectures."},"skills_sh_url":"https://skills.sh/agentskillexchange/skills/spark-dataframe-etl-pipeline"},"updatedAt":"2026-05-18T19:12:35.053Z"}}